Before you turn this problem in, make sure everything runs as expected. First, **restart the kernel** (in the menubar, select Kernel $\rightarrow$ Restart) and then **run all cells** (in the menubar, select Cell $\rightarrow$ Run All).

Make sure you fill in any place that says `YOUR CODE HERE` or "YOUR ANSWER HERE", as well as your name and collaborators below:

In [1]:
NAME = "Emily Huang"
COLLABORATORS = ""

---

Before you turn this problem in, make sure everything runs as expected. First, **restart the kernel** (in the menubar, select Kernel $\rightarrow$ Restart) and then **run all cells** (in the menubar, select Cell $\rightarrow$ Run All).

Make sure you fill in any place that says `YOUR CODE HERE` or "YOUR ANSWER HERE", as well as your name and collaborators below:

In [2]:
NAME = "Emily Huang"
COLLABORATORS = ""

In [3]:
pip install pymongo

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [None]:
import pandas as pd
import numpy as np
import sqlite3
import pymongo
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, classification_report, r2_score, accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
import seaborn as sns

# Simulated-data generator used by the rest of the project
def create_sleep_screen_data(num_records=1000):
    """Build a reproducible, simulated screen-time / sleep-quality dataset.

    Args:
        num_records: Number of rows to generate.

    Returns:
        A DataFrame with the columns ``ScreenTime`` (hours, limited to 0-12),
        ``SleepQuality`` (score limited to 1-10), ``DeviceType``,
        ``Timestamp`` and ``PoorSleep`` (1 when ``SleepQuality`` is below 5).

    Note:
        Reseeds NumPy's global random state with 42 on every call, so the
        same ``num_records`` always yields the same frame.
    """
    np.random.seed(42)

    # Daily screen time: normal(4, 2) draws, limited to a realistic 0-12 hours
    hours = np.random.normal(4, 2, num_records).clip(0, 12)

    # Baseline score of 5-10, lowered by 0.5-1 point per screen hour
    # (only the first 8 hours count towards the penalty)
    baseline = np.random.uniform(5, 10, num_records)
    per_hour_penalty = np.random.uniform(-1, -0.5, num_records)
    sleep_scores = (baseline + per_hour_penalty * hours.clip(0, 8)).clip(1, 10)

    # Each record gets a random device category
    devices = np.random.choice(['Phone', 'Tablet', 'Computer', 'TV'], num_records)

    # Timestamps are sampled (with repeats) from consecutive days starting 2023-01-01
    calendar = pd.date_range(start='2023-01-01', periods=num_records, freq='D')
    stamps = np.random.choice(calendar, size=num_records, replace=True)

    # Flag the records with a score below 5 as poor sleep
    poor_flags = (sleep_scores < 5).astype(int)

    return pd.DataFrame({
        'ScreenTime': hours,
        'SleepQuality': sleep_scores,
        'DeviceType': devices,
        'Timestamp': stamps,
        'PoorSleep': poor_flags,
    })
# Optional MongoDB loader: the project also stores the data in SQLite, so a
# missing or unreachable MongoDB server must never stop the rest of the run.
def setup_mongodb(data):
    """Load *data* into a MongoDB collection on a best-effort basis.

    Connects with pymongo's default connection settings, empties the
    ``sleep_screen_collection`` collection of ``sleep_screen_database`` and
    inserts one document per DataFrame row.

    Args:
        data: DataFrame whose rows are inserted as documents.

    Returns:
        ``(client, database, collection)`` on success, or
        ``(None, None, None)`` when anything goes wrong (no server, driver
        error, bad data, ...). The caller owns the returned client and
        should close it when done.
    """
    mongo_client = None
    try:
        mongo_client = pymongo.MongoClient()
        # Used as a connectivity check: raises if no server can be reached
        mongo_client.server_info()
        db = mongo_client["sleep_screen_database"]
        sleep_screen_collection = db["sleep_screen_collection"]

        data_records = data.to_dict('records')
        # Start from an empty collection so re-runs do not pile up duplicates
        sleep_screen_collection.delete_many({})
        sleep_screen_collection.insert_many(data_records)

        return mongo_client, db, sleep_screen_collection
    except Exception as exc:
        # Deliberately broad: MongoDB is optional, so every failure (including
        # a server-selection timeout) falls back to "not available" - but say
        # so instead of failing silently, and do not leave the client open.
        print(f"MongoDB unavailable ({type(exc).__name__}); continuing without it.")
        if mongo_client is not None:
            mongo_client.close()
        return None, None, None

# Creates the SQLite database and (re)writes the table the analysis queries read
def setup_database(data, db_name='sleep_screen_data.db'):
    """Write *data* to the ``SleepScreen`` table of a SQLite database.

    Args:
        data: DataFrame to store; its index is not written.
        db_name: Path of the SQLite database file.

    An existing ``SleepScreen`` table is replaced. The connection is closed
    even when the write fails.
    """
    sql_connection = sqlite3.connect(db_name)
    try:
        data.to_sql('SleepScreen', sql_connection, if_exists='replace', index=False)
    finally:
        sql_connection.close()
# Runs the analysis queries against the SQLite database and prints the results
def perform_analysis(db_name='sleep_screen_data.db'):
    """Run the summary queries on the ``SleepScreen`` table and print them.

    Three result sets are printed: per-device screen-time / sleep-quality
    averages, the per-device correlation between screen time and sleep
    quality, and monthly averages.

    Args:
        db_name: Path of the SQLite database file.

    Returns:
        ``(sleep_data_total, trends_monthly)``: every row of ``SleepScreen``
        and the monthly-trend frame with the columns ``Month``,
        ``AVGSleepQuality`` and ``AVGScreenTime``.
    """
    import math  # local import: only needed for the SQRT fallback below

    def _safe_sqrt(value):
        # Mirrors SQL semantics: NULL or negative input yields NULL
        if value is None or value < 0:
            return None
        return math.sqrt(value)

    # Query for average screen time and sleep quality per device
    device_screen_time_query = '''
    SELECT
        DeviceType,
        AVG(ScreenTime) AS AvgScreenTime,
        AVG(SleepQuality) AS AvgSleepQuality,
        MAX(ScreenTime) AS MaxScreenTime,
        MIN(ScreenTime) AS MinScreenTime
    FROM SleepScreen
    GROUP BY DeviceType
    '''

    # Query for correlation coefficients by device.
    # The means are computed per device in their own CTE and joined back in.
    # A correlated subquery of the form
    #   (SELECT AVG(x) FROM SleepScreen WHERE DeviceType = SleepScreen.DeviceType)
    # does not work: both names resolve to the inner table, the filter is
    # always true, and the global mean is used for every device.
    correlation_by_device_query = '''
    WITH DeviceMeans AS (
        SELECT
            DeviceType,
            AVG(ScreenTime) AS MeanScreenTime,
            AVG(SleepQuality) AS MeanSleepQuality
        FROM SleepScreen
        GROUP BY DeviceType
    ),
    DeviceStats AS (
        SELECT
            s.DeviceType AS DeviceType,
            AVG((s.ScreenTime - m.MeanScreenTime) *
                (s.ScreenTime - m.MeanScreenTime)) AS VarScreenTime,
            AVG((s.SleepQuality - m.MeanSleepQuality) *
                (s.SleepQuality - m.MeanSleepQuality)) AS VarSleepQuality,
            AVG((s.ScreenTime - m.MeanScreenTime) *
                (s.SleepQuality - m.MeanSleepQuality)) AS Covariance
        FROM SleepScreen AS s
        JOIN DeviceMeans AS m ON m.DeviceType = s.DeviceType
        GROUP BY s.DeviceType
    )
    SELECT
        DeviceType,
        ROUND(
            Covariance / (SQRT(VarScreenTime) * SQRT(VarSleepQuality)),
            4
        ) AS CorrelationCoefficient
    FROM DeviceStats;
    '''

    # Query for monthly trends
    time_series_query = '''
    SELECT
        strftime('%Y-%m', Timestamp) AS Month,
        ROUND(AVG(SleepQuality), 2) AS AVGSleepQuality,
        ROUND(AVG(ScreenTime), 2) AS AVGScreenTime
    FROM SleepScreen
    GROUP BY Month
    ORDER BY Month
    '''

    sql_connection = sqlite3.connect(db_name)
    try:
        # SQRT is only built into SQLite when it was compiled with math
        # functions; registering it here makes the query work on any build.
        sql_connection.create_function('SQRT', 1, _safe_sqrt)

        # Gets all the data from SleepScreen
        sleep_data_total = pd.read_sql('SELECT * FROM SleepScreen', sql_connection)
        avg_data_devices = pd.read_sql(device_screen_time_query, sql_connection)
        device_correlation = pd.read_sql(correlation_by_device_query, sql_connection)
        trends_monthly = pd.read_sql(time_series_query, sql_connection)
    finally:
        # Close the connection even when one of the queries fails
        sql_connection.close()

    # Prints all the findings from the queries
    print("Device Screen Time and Sleep Quality:")
    print(avg_data_devices)
    print("\nScreen and Sleep Quality Correlation by Device:")
    print(device_correlation)
    print("\nMonthly Sleep Quality and Screen Time Trends:")
    print(trends_monthly)

    return sleep_data_total, trends_monthly
# A more complex query (window function) to strengthen the SQL component:
# ranks the device types by their average sleep quality
def rank_devices(db_name='sleep_screen_data.db'):
    """Rank the device types by average sleep quality and print the ranking.

    Args:
        db_name: Path of the SQLite database file.

    Returns:
        A DataFrame with the columns ``DeviceType`` and ``SleepQualityRank``
        (rank 1 = highest average sleep quality), ordered by rank.
    """
    ranking_query = '''
        SELECT
            DeviceType,
            RANK() OVER (ORDER BY AVG(SleepQuality) DESC) AS SleepQualityRank
        FROM SleepScreen
        GROUP BY DeviceType
        ORDER BY SleepQualityRank
        '''
    connection = sqlite3.connect(db_name)
    try:
        device_ranking = pd.read_sql(ranking_query, connection)
    finally:
        # Always release the database handle, even if the query fails
        connection.close()

    print("Sleep Quality Rank by Device:")
    print(device_ranking.to_string(index=False))
    return device_ranking

# Monthly view of poor sleep and screen time, useful for long-term trends
def sleepq_by_month(db_name='sleep_screen_data.db'):
    """Summarise poor-sleep share and screen-time range per month.

    Args:
        db_name: Path of the SQLite database file.

    Returns:
        A DataFrame ordered by ``Month`` (``YYYY-MM``) with the columns
        ``PoorSleepPercentage`` (0-100), ``MaxScreenTime``,
        ``MinScreenTime`` and ``TotalRecords``. The frame is also printed.
    """
    month_query = '''
    SELECT
        strftime('%Y-%m', Timestamp) AS Month,
        SUM(CASE WHEN PoorSleep = 1 THEN 1 ELSE 0 END) * 100.0 / COUNT(*) AS PoorSleepPercentage,
        MAX(ScreenTime) AS MaxScreenTime,
        MIN(ScreenTime) AS MinScreenTime,
        COUNT(*) AS TotalRecords
    FROM SleepScreen
    GROUP BY Month
    ORDER BY Month;
    '''
    connection = sqlite3.connect(db_name)
    try:
        monthly_summary = pd.read_sql(month_query, connection)
    finally:
        # Close the connection even when the query raises
        connection.close()

    print("Monthly Sleep Quality and Screen Time:")
    print(monthly_summary)
    return monthly_summary

# Linear regression that predicts sleep quality from screen time
def build_prediction_model(data):
    """Fit and evaluate a linear regression of SleepQuality on ScreenTime.

    Args:
        data: DataFrame with ``ScreenTime`` and ``SleepQuality`` columns.

    Returns:
        ``(model, X_test, y_test, y_pred)``: the fitted regressor, the
        held-out features and targets, and the predictions for them.
        The test-set MSE, the fitted equation and R^2 are printed.
    """
    features = data[['ScreenTime']]   # independent variable
    target = data['SleepQuality']     # dependent variable

    # 70/30 train/test split with a fixed seed, as in the previous lab
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=42
    )

    regressor = LinearRegression()
    regressor.fit(X_train, y_train)

    # Evaluate on the held-out rows: mean squared error and R^2
    y_pred = regressor.predict(X_test)
    test_mse = mean_squared_error(y_test, y_pred)
    test_r2 = regressor.score(X_test, y_test)

    # Print the results calculated
    print(f"Mean Squared Error: {test_mse}")
    print(f"Regression Equation for Sleep Quality = {regressor.intercept_:.2f} + {regressor.coef_[0]:.2f} * Screen Time")
    print(f"R^2 value: {test_r2:.4f}")

    return regressor, X_test, y_test, y_pred
# Additional machine-learning component: a decision tree classifier
def dt_model(data):
    """Train a decision tree that predicts PoorSleep from ScreenTime.

    Args:
        data: DataFrame with ``ScreenTime`` and ``PoorSleep`` columns.

    Returns:
        The fitted ``DecisionTreeClassifier``. Its accuracy on the held-out
        30% of the rows is printed.
    """
    features = data[['ScreenTime']]
    target = data['PoorSleep']

    # Same 70/30 split and seed as the other models
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.3, random_state=42
    )

    tree = DecisionTreeClassifier(random_state=42)
    tree.fit(X_train, y_train)

    test_accuracy = accuracy_score(y_test, tree.predict(X_test))
    print(f"Decision Tree Accuracy: {test_accuracy:.2f}")

    return tree
# Another model: logistic regression, following the same steps as the
# previous two models
def logisticr_model(data):
    """Train a logistic regression that predicts PoorSleep from ScreenTime.

    Args:
        data: DataFrame with ``ScreenTime`` and ``PoorSleep`` columns.

    Returns:
        The fitted ``LogisticRegression`` model. A classification report for
        the held-out 30% of the rows is printed.
    """
    X = data[['ScreenTime']]
    y = data['PoorSleep']
    # Split training and testing like we did in the previous lab
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    lr_model = LogisticRegression()
    lr_model.fit(X_train, y_train)

    y_pred = lr_model.predict(X_test)

    print("Logistic Regression:")
    print(classification_report(y_test, y_pred))

    # Return the fitted estimator (not this function object) so the caller
    # can actually use the model
    return lr_model

# Visualizations of the data (eleven figures in total)
def create_visualizations(data, model, X_test, y_test, y_pred, time_series_data):
    """Draw and show the project's figures.

    Args:
        data: DataFrame with ``ScreenTime``, ``SleepQuality``,
            ``DeviceType``, ``Timestamp`` and ``PoorSleep`` columns.
            It is not modified; helper columns are added to a copy.
        model: Fitted regression model. Currently unused; kept so existing
            callers keep working.
        X_test: Held-out features with a ``ScreenTime`` column.
        y_test: Actual sleep-quality values for ``X_test``.
        y_pred: Predicted sleep-quality values for ``X_test``.
        time_series_data: Monthly frame with ``Month``, ``AVGSleepQuality``
            and ``AVGScreenTime`` columns.

    Returns:
        None. Every figure is displayed with ``plt.show()``.
    """
    # Work on a copy: the ScreenTimeRange and Month helper columns added
    # below must not leak into the caller's DataFrame
    data = data.copy()

    # 1 scatterplot with regression line
    plt.figure(figsize=(12, 5))
    plt.subplot(121)
    plt.scatter(data['ScreenTime'], data['SleepQuality'], alpha=0.6, color='blue')
    plt.plot(X_test['ScreenTime'], y_pred, color='red', linewidth=2, label='Regression Line')
    plt.title('Screen Time versus Sleep Quality')
    plt.xlabel('Screen Time (Hours)')
    plt.ylabel('Sleep Quality (1-10)')
    plt.legend()
    plt.grid(True)

    # 2 actual versus predicted sleep quality (dashed line = perfect prediction)
    plt.subplot(122)
    plt.scatter(y_test, y_pred, alpha=0.6, color='green')
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
    plt.title('Predicted versus Actual Sleep Quality')
    plt.xlabel('Actual Sleep Quality')
    plt.ylabel('Predicted Sleep Quality')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # 3 line plot for monthly sleep quality and screen time
    plt.figure(figsize=(10, 6))
    plt.plot(time_series_data['Month'], time_series_data['AVGSleepQuality'], marker='o', label='Avg Sleep Quality')
    plt.plot(time_series_data['Month'], time_series_data['AVGScreenTime'], marker='s', label='Avg Screen Time')
    plt.title('Monthly Sleep Quality and Screen Time Trends')
    plt.xlabel('Month')
    plt.ylabel('Value')
    plt.legend()
    plt.xticks(rotation=45)
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # 4 boxplot for screen time by device category
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='DeviceType', y='ScreenTime', data=data, hue='DeviceType', palette='Set3')
    plt.title('Screen Time Distribution by Device Type')
    plt.xlabel('Device Type')
    plt.ylabel('Screen Time (Hours)')
    plt.show()

    # 5 graph for average sleep quality by time ranges 0-2, 2-4, 4-6, 6-8
    # NOTE(review): rows with more than 8 hours of screen time fall outside
    # every bin and are left out of this chart - confirm that is intended.
    range_edges = [0, 2, 4, 6, 8]
    range_labels = ['0-2 hours', '2-4 hours', '4-6 hours', '6-8 hours']
    data['ScreenTimeRange'] = pd.cut(data['ScreenTime'], bins=range_edges, labels=range_labels, include_lowest=True)

    avg_sleep_quality = data.groupby('ScreenTimeRange', observed=False)['SleepQuality'].mean().reset_index()

    plt.figure(figsize=(10, 6))
    sns.barplot(x='ScreenTimeRange', y='SleepQuality', data=avg_sleep_quality)
    plt.title('Average Sleep Quality by Screen Time Range')
    plt.xlabel('Screen Time (Hours)')
    plt.ylabel('Average Sleep Quality')
    plt.grid(axis='y', linestyle='--', alpha=0.5)
    plt.tight_layout()
    plt.show()

    # 6 screen time distribution graph
    plt.figure(figsize=(8, 6))
    kde_ax = sns.kdeplot(data=data, x='ScreenTime', hue='PoorSleep', fill=True, alpha=0.5, palette='crest')
    plt.title('Screen Time Distribution by Sleep Quality')
    plt.xlabel('Screen Time (Hours)')
    plt.ylabel('Density')
    # Retitle the legend seaborn already drew for the hue variable. Rebuilding
    # it from get_legend_handles_labels() would produce an empty legend,
    # because the density artists carry no labels of their own.
    hue_legend = kde_ax.get_legend()
    if hue_legend is not None:
        hue_legend.set_title('Poor Sleep')
    plt.show()

    # 7 monthly line graph to show data for just phones
    data['Month'] = pd.to_datetime(data['Timestamp']).dt.to_period('M')
    phone_data = data[data['DeviceType'] == 'Phone'].groupby('Month')[['PoorSleep', 'ScreenTime']].mean()

    plt.figure(figsize=(10, 6))
    plt.plot(phone_data.index.astype(str), phone_data['PoorSleep'], label='Poor Sleep (Phone)', linestyle='-', marker='o')
    plt.plot(phone_data.index.astype(str), phone_data['ScreenTime'], label='Screen Time (Phone)', linestyle='--', marker='x')
    plt.title('Monthly Poor Sleep and Screen Time for Phones')
    plt.xlabel('Month')
    plt.ylabel('Values')
    plt.xticks(rotation=45)
    plt.legend()

    plt.tight_layout()
    plt.show()

    # 8 graph for screen time distribution by sleep quality
    # FacetGrid creates its own figure, so no plt.figure() call here - an
    # extra one would only show up as an empty figure.
    g = sns.FacetGrid(data, col="PoorSleep", height=5, aspect=1.2)
    g.map(sns.histplot, "ScreenTime", element="step")
    g.set_axis_labels("Screen Time (Hours)", "Count")
    g.set_titles(col_template="{col_name}")
    g.fig.suptitle('Screen Time Distribution by Sleep Quality', y=1.02)
    plt.show()

    # 9 graph for all devices for sleep quality
    plt.figure(figsize=(12, 6))
    sns.boxplot(x='DeviceType', y='SleepQuality', data=data, hue='DeviceType', palette='Set2')
    sns.stripplot(x='DeviceType', y='SleepQuality', data=data)
    plt.title('Sleep Quality Distribution across Device Types')
    plt.xlabel('Device Type')
    plt.ylabel('Sleep Quality')
    plt.show()

    # 10 sleep quality distribution graph
    plt.figure(figsize=(10, 6))
    sns.kdeplot(data=data[data['PoorSleep'] == 0]['SleepQuality'], label='Good Sleep', fill=True)
    sns.kdeplot(data=data[data['PoorSleep'] == 1]['SleepQuality'], label='Poor Sleep', fill=True)
    plt.title('Sleep Quality Distribution')
    plt.xlabel('Sleep Quality')
    plt.ylabel('Density')
    plt.legend()
    plt.show()

    # 11 good or poor sleep graph
    plt.figure(figsize=(10, 6))
    sns.boxplot(x='PoorSleep', y='ScreenTime', fill=True, data=data, hue='PoorSleep', palette='Set3')
    plt.title('Screen Time Distribution by Sleep Quality')
    plt.xlabel('Sleep Quality (0 = Good Sleep, 1 = Poor Sleep)')
    plt.ylabel('Screen Time (Hours)')
    plt.legend(title='Poor Sleep')
    plt.show()
        
                   
def main():
    """Run the whole project pipeline end to end.

    Generates the simulated data, stores it in SQLite, tries the optional
    MongoDB load, runs the SQL analyses, trains the three models and draws
    the figures.
    """
    sleep_data = create_sleep_screen_data()
    setup_database(sleep_data)

    # MongoDB is optional: everything below reads from SQLite, so a failed
    # MongoDB setup is reported and the run continues.
    mongodb_client = None
    try:
        mongodb_client, _db, _collection = setup_mongodb(sleep_data)
    except Exception as exc:
        print(f"MongoDB setup failed ({type(exc).__name__}); continuing with SQLite only.")

    try:
        analysis_data, time_series_data = perform_analysis()
        rank_devices()
        sleepq_by_month()
        trained_model, X_test, y_test, y_pred = build_prediction_model(analysis_data)
        decision_tree = dt_model(analysis_data)
        logistic_model = logisticr_model(analysis_data)
        create_visualizations(analysis_data, trained_model, X_test, y_test, y_pred, time_series_data)
    finally:
        # Release the MongoDB connection if one was opened
        if mongodb_client is not None:
            mongodb_client.close()
        
        
# Run the full pipeline only when this file is executed directly, not when
# it is imported as a module
if __name__ == "__main__": 
       main()