# **Data Cleaning**

In [None]:
# Import necessary libraries
import pandas as pd

# Load the datasets.
# DATA_DIR is the one place to edit when the CSV files live somewhere else;
# its default is the directory this notebook originally hard-coded.
DATA_DIR = "/content"

reaction = pd.read_csv(f"{DATA_DIR}/Reactions.csv")
reaction_types = pd.read_csv(f"{DATA_DIR}/ReactionTypes.csv")
content = pd.read_csv(f"{DATA_DIR}/Content.csv")

In [None]:
# Drop the columns that the rest of the notebook does not use.
#
# The result is assigned back instead of using inplace=True, and
# errors='ignore' is passed, so that running this cell a second time is a
# no-op rather than a KeyError on the already-removed columns.
# NOTE: errors='ignore' also hides a misspelled column name, so double-check
# the labels below if the input schema ever changes.

# Drop unnecessary columns from 'content' dataframe
content = content.drop(columns=['Unnamed: 0', 'User ID', 'URL'], errors='ignore')

# Drop unnecessary columns from 'reaction_types' dataframe
reaction_types = reaction_types.drop(columns=['Unnamed: 0'], errors='ignore')

# Drop unnecessary columns from 'reaction' dataframe
reaction = reaction.drop(columns=['Unnamed: 0', 'User ID'], errors='ignore')

In [None]:
# Attach to every reaction the content item it was left on (left join on
# 'Content ID', so reactions without matching content are kept).
# Both inputs have a 'Type' column, so pandas disambiguates them with its
# default suffixes: 'Type_x' is the left side (`reaction`), 'Type_y' is the
# right side (`content`).
df = reaction.merge(content, how='left', on='Content ID')

# Give the two 'Type' columns names that match what they hold:
#   'Type_x' (from `reaction`) -> the reaction itself, e.g. 'disgust'
#   'Type_y' (from `content`)  -> the media format,    e.g. 'photo'
# These two labels were previously swapped.
df = df.rename(columns={'Type_x': 'Reaction Type', 'Type_y': 'Content Type'})

# Look up each reaction's 'Sentiment' and 'Score'. `reaction_types` is keyed
# by 'Type', so that key is renamed on the fly to line up with 'Reaction Type'
# (this leaves `reaction_types` itself untouched and avoids a stray 'Type'
# column in the result).
df = df.merge(
    reaction_types.rename(columns={'Type': 'Reaction Type'}),
    how='left',
    on='Reaction Type',
)

# Preview the first rows only
df.head()

Unnamed: 0,Content ID,Content Type,Datetime,Reaction Type,Category,Sentiment,Score
0,97522e57-d9ab-4bd6-97bf-c24d952602d2,,2021-04-22 15:17:15,photo,Studying,,
1,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2020-11-07 09:43:50,photo,Studying,negative,0.0
2,97522e57-d9ab-4bd6-97bf-c24d952602d2,dislike,2021-06-17 12:22:51,photo,Studying,negative,10.0
3,97522e57-d9ab-4bd6-97bf-c24d952602d2,scared,2021-04-18 05:13:58,photo,Studying,negative,15.0
4,97522e57-d9ab-4bd6-97bf-c24d952602d2,disgust,2021-01-06 19:13:01,photo,Studying,negative,0.0
...,...,...,...,...,...,...,...
25548,75d6b589-7fae-4a6d-b0d0-752845150e56,dislike,2020-06-27 09:46:48,audio,technology,negative,10.0
25549,75d6b589-7fae-4a6d-b0d0-752845150e56,intrigued,2021-02-16 17:17:02,audio,technology,positive,45.0
25550,75d6b589-7fae-4a6d-b0d0-752845150e56,interested,2020-09-12 03:54:58,audio,technology,positive,30.0
25551,75d6b589-7fae-4a6d-b0d0-752845150e56,worried,2020-11-04 20:08:31,audio,technology,negative,12.0


In [None]:
# Per-column count of NaN entries in the merged dataframe
df.isna().sum()

Content ID         0
Content Type     980
Datetime           0
Reaction Type      0
Category           0
Sentiment        980
Score            980
dtype: int64

In [None]:
# Keep only the rows in which every column has a value; `df` is left as is
df_cleaned = df.dropna(axis=0, how='any')

# Confirm that no NaN entries remain in any column
df_cleaned.isna().sum()

Content ID       0
Content Type     0
Datetime         0
Reaction Type    0
Category         0
Sentiment        0
Score            0
dtype: int64

In [None]:
# List the distinct raw labels found in 'Category', in order of first appearance
pd.unique(df_cleaned['Category'])

array(['Studying', 'healthy eating', 'technology', 'food', 'cooking',
       'dogs', 'soccer', 'public speaking', 'science', 'tennis', 'travel',
       'fitness', 'education', 'studying', 'veganism', 'Animals',
       'animals', 'culture', '"culture"', 'Fitness', '"studying"',
       'Veganism', '"animals"', 'Travel', '"soccer"', 'Education',
       '"dogs"', 'Technology', 'Soccer', 'Culture', '"food"', 'Food',
       '"technology"', 'Healthy Eating', '"cooking"', 'Science',
       '"public speaking"', '"veganism"', 'Public Speaking', '"science"'],
      dtype=object)

In [None]:
# Normalise the 'Category' labels: strip the literal double-quote characters
# and lowercase everything, so that e.g. '"Animals"', 'Animals' and 'animals'
# collapse into a single category.
#
# .assign returns a new dataframe instead of writing a column into the result
# of dropna(), which avoids pandas' SettingWithCopyWarning and keeps this cell
# safe to re-run.
df_cleaned = df_cleaned.assign(
    Category=(
        df_cleaned['Category']
        .str.replace('"', '', regex=False)  # literal match, no regex needed
        .str.lower()
    )
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Category'] = df_cleaned['Category'].replace('"', '', regex = True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Category']= df_cleaned['Category'].str.lower()


In [None]:
# Distinct 'Category' labels as they currently stand in df_cleaned
df_cleaned.loc[:, 'Category'].unique()

array(['studying', 'healthy eating', 'technology', 'food', 'cooking',
       'dogs', 'soccer', 'public speaking', 'science', 'tennis', 'travel',
       'fitness', 'education', 'veganism', 'animals', 'culture'],
      dtype=object)

In [None]:
# Write df_cleaned to 'Final data.csv' in the working directory,
# leaving the row index out of the file
df_cleaned.to_csv(path_or_buf='Final data.csv', index=False)

# **Data Modeling**

In [None]:
# Number of rows per category (non-null 'Reaction Type' entries), largest
# first; only the default head() of five categories is kept.
category_reaction_counts = (
    df_cleaned
    .groupby('Category', as_index=False)['Reaction Type']
    .count()
    .sort_values(by='Reaction Type', ascending=False)
    .head()
    .reset_index(drop=True)
)

# Total 'Score' per category, largest first; again only the first five.
category_score_sum = (
    df_cleaned
    .groupby('Category', as_index=False)['Score']
    .sum()
    .sort_values(by='Score', ascending=False)
    .head()
    .reset_index(drop=True)
)

# Print both rankings, each under its own heading
print("Top Categories by Reaction Type Counts:", category_reaction_counts, sep="\n")
print("\nTop Categories by Score Sum:", category_score_sum, sep="\n")

Top Categories by Reaction Type Counts:
         Category  Reaction Type
0         animals           1897
1         science           1796
2  healthy eating           1717
3            food           1699
4      technology           1698

Top Categories by Score Sum:
         Category    Score
0         animals  74965.0
1         science  71168.0
2  healthy eating  69339.0
3      technology  68738.0
4            food  66676.0
