In [47]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [48]:
# Load the raw screen-time survey table and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='screen_time.csv')
data.head(n=5)

Unnamed: 0,Age,Gender,Screen Time Type,Day Type,Average Screen Time (hours),Sample Size
0,5,Male,Educational,Weekday,0.44,500
1,5,Male,Recreational,Weekday,1.11,500
2,5,Male,Total,Weekday,1.55,500
3,5,Male,Educational,Weekend,0.5,500
4,5,Male,Recreational,Weekend,1.44,500


A couple of notes:
- There are a lot of repeated entries. This is because the Educational and Recreational values sum to the Total value. If we're just trying to predict Average Screen Time, we should keep only the Total rows.
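- Gender and Day Type are non-numeric, but each category can easily be turned into its own 0/1 (True/False) indicator column. This is called One-Hot Encoding. (Mapping the categories to 0, 1, 2, etc. in a single column would instead be label/ordinal encoding.)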

In [49]:
# Keep only the rows whose Screen Time Type is 'Total', then renumber the index from 0.
data = data[data['Screen Time Type'] == 'Total'].reset_index(drop=True)

# One-hot encode the categorical columns. drop_first=True omits the first level of
# each category, so it is represented by all of its indicator columns being False.
data_encoded = pd.get_dummies(data, drop_first=True, columns=['Gender', 'Day Type'])
print(data_encoded.columns)

data_encoded.head()

Index(['Age', 'Screen Time Type', 'Average Screen Time (hours)', 'Sample Size',
       'Gender_Male', 'Gender_Other/Prefer not to say', 'Day Type_Weekend'],
      dtype='object')


Unnamed: 0,Age,Screen Time Type,Average Screen Time (hours),Sample Size,Gender_Male,Gender_Other/Prefer not to say,Day Type_Weekend
0,5,Total,1.55,500,True,False,False
1,5,Total,1.93,500,True,False,True
2,5,Total,1.45,500,False,False,False
3,5,Total,1.9,500,False,False,True
4,5,Total,1.5,500,False,True,False


In [50]:
# "Sample Size" and "Screen Time Type" are not used as features, so drop them.
# Reassign rather than mutate with inplace=True, and ignore columns that are
# already gone, so that re-running this cell does not raise a KeyError.
data_encoded = data_encoded.drop(columns=['Sample Size', 'Screen Time Type'], errors='ignore')
data_encoded.head()

Unnamed: 0,Age,Average Screen Time (hours),Gender_Male,Gender_Other/Prefer not to say,Day Type_Weekend
0,5,1.55,True,False,False
1,5,1.93,True,False,True
2,5,1.45,False,False,False
3,5,1.9,False,False,True
4,5,1.5,False,True,False


In [52]:
# Separate the regression target from the remaining feature columns.
target_col = 'Average Screen Time (hours)'
y = data_encoded[target_col]
X = data_encoded.drop(columns=[target_col])

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Preview the features and the target (sep="\n" puts each item on its own line).
print("X:", X.head(), "\nY:", y.head(), sep="\n")

X:
   Age  Gender_Male  Gender_Other/Prefer not to say  Day Type_Weekend
0    5         True                           False             False
1    5         True                           False              True
2    5        False                           False             False
3    5        False                           False              True
4    5        False                            True             False

Y:
0    1.55
1    1.93
2    1.45
3    1.90
4    1.50
Name: Average Screen Time (hours), dtype: float64


In [None]:
# Feature scaling is a preprocessing step; try it and see whether the MSE of the
# linear regression model changes.
# NOTE: ordinary least squares with an intercept is invariant to per-feature affine
# rescaling, so the MSE is expected to stay the same (up to floating-point noise).
# Scaling matters for regularised or distance-based models.
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the same
# fitted transform to both splits so no test-set information leaks into the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
# Baseline model: linear regression fitted on the scaled training features.
from sklearn.linear_model import LinearRegression

model = LinearRegression().fit(X_train_scaled, y_train)  # fit() returns the estimator itself
y_pred = model.predict(X_test_scaled)

# Evaluate the predictions on the held-out test split.
from sklearn.metrics import mean_squared_error, r2_score
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R^2 Score: {r2}")

Mean Squared Error: 0.05342832298886539
R^2 Score: 0.9883074754058151
