In [7]:
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv('Data/project3_dataset.csv')

# Display my DataFrame
df.head()


Unnamed: 0,Age,Gender,RelationshipStatus,Occupation,Organization,SocialMedia,SocialMediaPlatforms,NumberofSocialMediaPlatforms,HoursSpent,Frequency,Distraction,Restlessness,Anxiety,ConcentrationDifficulty,SelfComparision,PostSentiment,ValidationSeeking,Depression,ActivityInterest Variance,Sleeplessness
0,21.0,Male,In a relationship,University Student,University,Yes,"Facebook, Twitter, Instagram, YouTube, Discord...",6,2 to 5,5,3,2,2,5,2,3,2,5,4,5
1,21.0,Female,Single,University Student,University,Yes,"Facebook, Twitter, Instagram, YouTube, Discord...",6,More than 5,4,3,2,5,4,5,1,1,5,4,5
2,21.0,Female,Single,University Student,University,Yes,"Facebook, Instagram, YouTube, Pinterest",4,2 to 5,2,2,1,5,4,3,3,1,4,2,5
3,21.0,Female,Single,University Student,University,Yes,"Facebook, Instagram",2,More than 5,3,2,1,5,3,5,1,2,4,3,2
4,21.0,Female,Single,University Student,University,Yes,"Facebook, Instagram, YouTube",3,2 to 5,4,5,4,5,5,3,3,3,4,4,1


In [8]:
# Check for missing values in each column
missing_values = df.isnull().any()

# Print columns with missing values
print(missing_values[missing_values == True])

Series([], dtype: bool)


In [9]:
# Rename columns
df = df.rename(columns={'Age ': 'Age', 'Occupation ': 'Occupation'})

# Now you can check the column names to verify the change
print(df.columns)

Index(['Age', 'Gender', 'RelationshipStatus', 'Occupation', 'Organization',
       'SocialMedia', 'SocialMediaPlatforms', 'NumberofSocialMediaPlatforms',
       'HoursSpent', 'Frequency', 'Distraction', 'Restlessness', 'Anxiety',
       'ConcentrationDifficulty', 'SelfComparision', 'PostSentiment',
       'ValidationSeeking', 'Depression', 'ActivityInterest Variance',
       'Sleeplessness'],
      dtype='object')


In [10]:
#List all my columns 
print(df.columns)

Index(['Age', 'Gender', 'RelationshipStatus', 'Occupation', 'Organization',
       'SocialMedia', 'SocialMediaPlatforms', 'NumberofSocialMediaPlatforms',
       'HoursSpent', 'Frequency', 'Distraction', 'Restlessness', 'Anxiety',
       'ConcentrationDifficulty', 'SelfComparision', 'PostSentiment',
       'ValidationSeeking', 'Depression', 'ActivityInterest Variance',
       'Sleeplessness'],
      dtype='object')


In [11]:
# Create a list of all unique social media platforms
all_platforms = set()
for platforms in df['SocialMediaPlatforms'].str.split(', '):
    all_platforms.update(platforms)

# Create new binary columns for each platform
for platform in all_platforms:
    df[platform] = df['SocialMediaPlatforms'].str.contains(platform).astype(int)

In [12]:
print(df.columns)

Index(['Age', 'Gender', 'RelationshipStatus', 'Occupation', 'Organization',
       'SocialMedia', 'SocialMediaPlatforms', 'NumberofSocialMediaPlatforms',
       'HoursSpent', 'Frequency', 'Distraction', 'Restlessness', 'Anxiety',
       'ConcentrationDifficulty', 'SelfComparision', 'PostSentiment',
       'ValidationSeeking', 'Depression', 'ActivityInterest Variance',
       'Sleeplessness', 'YouTube', 'Discord', 'Snapchat', 'Instagram',
       'TikTok', 'Facebook', 'Twitter', 'Reddit', 'Pinterest'],
      dtype='object')


In [13]:
df.head()

Unnamed: 0,Age,Gender,RelationshipStatus,Occupation,Organization,SocialMedia,SocialMediaPlatforms,NumberofSocialMediaPlatforms,HoursSpent,Frequency,...,Sleeplessness,YouTube,Discord,Snapchat,Instagram,TikTok,Facebook,Twitter,Reddit,Pinterest
0,21.0,Male,In a relationship,University Student,University,Yes,"Facebook, Twitter, Instagram, YouTube, Discord...",6,2 to 5,5,...,5,1,1,0,1,0,1,1,1,0
1,21.0,Female,Single,University Student,University,Yes,"Facebook, Twitter, Instagram, YouTube, Discord...",6,More than 5,4,...,5,1,1,0,1,0,1,1,1,0
2,21.0,Female,Single,University Student,University,Yes,"Facebook, Instagram, YouTube, Pinterest",4,2 to 5,2,...,5,1,0,0,1,0,1,0,0,1
3,21.0,Female,Single,University Student,University,Yes,"Facebook, Instagram",2,More than 5,3,...,2,0,0,0,1,0,1,0,0,0
4,21.0,Female,Single,University Student,University,Yes,"Facebook, Instagram, YouTube",3,2 to 5,4,...,1,1,0,0,1,0,1,0,0,0


In [14]:
# Drop duplicate columns
df = df.loc[:, ~df.columns.duplicated()]

# Display the updated DataFrame Columns
print(df.columns)

Index(['Age', 'Gender', 'RelationshipStatus', 'Occupation', 'Organization',
       'SocialMedia', 'SocialMediaPlatforms', 'NumberofSocialMediaPlatforms',
       'HoursSpent', 'Frequency', 'Distraction', 'Restlessness', 'Anxiety',
       'ConcentrationDifficulty', 'SelfComparision', 'PostSentiment',
       'ValidationSeeking', 'Depression', 'ActivityInterest Variance',
       'Sleeplessness', 'YouTube', 'Discord', 'Snapchat', 'Instagram',
       'TikTok', 'Facebook', 'Twitter', 'Reddit', 'Pinterest'],
      dtype='object')


In [15]:
# Split the 'HoursSpent' column 
hour_categories = ['Less than 2', '2 to 5', 'More than 5']

# Create new binary columns for each hour category
for category in hour_categories:
    df[category] = (df['HoursSpent'] == category).astype(int)

# Drop the original 'HoursSpent' column
df.drop(columns=['HoursSpent'], inplace=True)

In [16]:
# Rename columns
df = df.rename(columns={'Less than 2': 'Less than 2hrs', '2 to 5': '2 to 5 hrs', 'More than 5' : 'More than 5 hrs'})

In [17]:
print(df.columns)

Index(['Age', 'Gender', 'RelationshipStatus', 'Occupation', 'Organization',
       'SocialMedia', 'SocialMediaPlatforms', 'NumberofSocialMediaPlatforms',
       'Frequency', 'Distraction', 'Restlessness', 'Anxiety',
       'ConcentrationDifficulty', 'SelfComparision', 'PostSentiment',
       'ValidationSeeking', 'Depression', 'ActivityInterest Variance',
       'Sleeplessness', 'YouTube', 'Discord', 'Snapchat', 'Instagram',
       'TikTok', 'Facebook', 'Twitter', 'Reddit', 'Pinterest',
       'Less than 2hrs', '2 to 5 hrs', 'More than 5 hrs'],
      dtype='object')


In [19]:
df.head()

In [None]:
df.to_csv('proj3_cleaned_data.csv', index=False)