<a href="https://colab.research.google.com/github/enasshrafeldeen/movies/blob/main/wrangle_act_(6).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [None]:
# Importing necessary libraries
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests as rq


# Reading files

In [None]:
# Read the Twitter archive CSV into df1 and preview its first five rows.
df1 = pd.read_csv(filepath_or_buffer="twitter-archive-enhanced (2).csv")
df1.head(n=5)

In [None]:
import pandas as pd

# Load the tab-separated image-prediction file into df2.
df2 = pd.read_csv('image-predictions (2).tsv', sep='\t')

# Preview only the first rows; as the cell's last expression the frame gets
# the notebook's rich table rendering instead of a plain-text dump of every row.
df2.head()


In [None]:
import pandas as pd

# Load the tweet JSON file into df3. Later cells of this notebook read the same
# file with read_json(lines=True) and use its 'id_str', 'retweet_count' and
# 'favorite_count' fields, so it is parsed the same way here; read_csv with a
# tab separator would not split the JSON records into columns.
df3 = pd.read_json('tweet-json (1).json', lines=True)

# Preview only the first rows instead of printing the whole frame.
df3.head()


# Assessing data

In [None]:
# Shape of the archive table df1 as a (rows, columns) tuple
df1.shape

In [None]:
# explore p.1
# info() prints its report directly, so it runs first; describe() only returns
# a table, so it must be the cell's last expression to be displayed.
df1.info()
df1.describe()

In [None]:
# Shape of the image-prediction table df2 as a (rows, columns) tuple
df2.shape

In [None]:
# explore p.2
# info() prints its report directly, so it runs first; describe() only returns
# a table, so it must be the cell's last expression to be displayed.
df2.info()
df2.describe()

In [None]:
# Shape of the tweet JSON table df3 as a (rows, columns) tuple
# (this third assessment step previously re-inspected df2).
df3.shape

In [None]:
# explore p.3
# Part 3 covers df3 (it previously repeated the df2 checks). info() prints its
# report directly; describe() is last so its table is the displayed result.
df3.info()
df3.describe()

# Quality Issues – Detect

In [None]:
# Work on independent copies so the frames loaded above stay untouched.
df_archive_clean, df_image_clean, df_json_clean = (
    frame.copy() for frame in (df1, df2, df3)
)


In [None]:
# A notebook cell renders only its last expression, so every check below is
# shown explicitly with print() or display() (display is provided by the
# IPython/Jupyter kernel).

# 1. Retweets present
retweets = df_archive_clean[df_archive_clean.retweeted_status_id.notnull()]
print("Rows with a retweeted_status_id:", len(retweets))

# 2. Incorrect datatypes
print(df_archive_clean.dtypes[['tweet_id', 'timestamp']])  # Should be str and datetime

# 3. Inconsistent ratings
display(df_archive_clean[['rating_numerator', 'rating_denominator']].describe())

# 4. Rating denominators not always 10
display(df_archive_clean[df_archive_clean['rating_denominator'] != 10])

# 5. Invalid dog names (case-insensitive match on placeholder words)
invalid_names = df_archive_clean['name'].str.lower().isin(['a', 'an', 'the', 'none'])
display(df_archive_clean[invalid_names])

# 6. Multiple dog stages: count, per row, the stage cells that hold a stage name
stage_columns = ['doggo', 'floofer', 'pupper', 'puppo']
stage_hits = df_archive_clean[stage_columns].isin(stage_columns).sum(axis=1)
display(df_archive_clean[stage_hits > 1])

# 7. Missing image predictions
missing_images = ~df_archive_clean['tweet_id'].isin(df_image_clean['tweet_id'])
print("Archive rows without an image-prediction row:", int(missing_images.sum()))

# 8. Breed prediction not a dog
non_dogs = df_image_clean[df_image_clean['p1_dog'] == False]
print("Image-prediction rows with p1_dog == False:", len(non_dogs))


# Tidiness Issues – Detect

In [None]:
# 1. The dog stage is spread over four separate columns
df_archive_clean.loc[:, ['doggo', 'floofer', 'pupper', 'puppo']].info()

# 2. The data is held in three separate tables
#    (df_archive_clean, df_image_clean, df_json_clean)


# Cleaning data

In [None]:
# Number of missing values per column in df1
df1.isna().sum()

In [None]:
# Number of missing values per column in df2
df2.isna().sum()

In [None]:
# Number of missing values per column in df3
df3.isna().sum()

In [None]:
# drop the nulls p.1
def remove_null_columns(df1):
    """Return a new frame without the columns of ``df1`` that are entirely missing.

    The input frame is left unchanged.

    NOTE(review): the parameter shadows the notebook-level ``df1``, and the same
    function name is defined again in the next two cells; whichever definition
    ran last is the one in effect.
    """
    return df1.dropna(axis='columns', how='all')

In [None]:
# drop the nulls p.2
def remove_null_columns(df2):
    """Return a new frame without the columns of ``df2`` that are entirely missing.

    The input frame is left unchanged.

    NOTE(review): the parameter shadows the notebook-level ``df2``, and the same
    function name is also defined in the neighbouring cells; whichever
    definition ran last is the one in effect.
    """
    return df2.dropna(axis='columns', how='all')

In [None]:
# drop the nulls p.3
def remove_null_columns(df3):
    """Return a new frame without the columns of ``df3`` that are entirely missing.

    The input frame is left unchanged.

    NOTE(review): the parameter shadows the notebook-level ``df3``, and the same
    function name is also defined in the two preceding cells; whichever
    definition ran last is the one in effect.
    """
    return df3.dropna(axis='columns', how='all')

In [None]:
# info() prints its report directly; the per-column share of missing values is
# only a return value, so it goes last to become the cell's displayed result.
df1.info()
df1.isna().mean()

In [None]:
# info() prints its report directly; the per-column share of missing values is
# only a return value, so it goes last to become the cell's displayed result.
df2.info()
df2.isna().mean()

In [None]:
# info() prints its report directly; the per-column share of missing values is
# only a return value, so it goes last to become the cell's displayed result.
df3.info()
df3.isna().mean()

In [None]:
# remove retweets
def remove_retweets(df):
    """Return the rows of ``df`` whose ``retweeted_status_id`` is missing.

    Rows that carry a ``retweeted_status_id`` are dropped; the original index
    labels of the surviving rows are kept.
    """
    has_no_retweet_id = df['retweeted_status_id'].isna()
    return df.loc[has_no_retweet_id]

# convert data types
def convert_dtypes(df):
    """Return a copy of ``df`` with ``tweet_id`` as strings and ``timestamp`` as datetimes.

    The conversion is done with ``assign`` so the caller's frame is not
    modified; this also avoids SettingWithCopyWarning when ``df`` is a filtered
    slice such as the result of ``remove_retweets``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``tweet_id`` and ``timestamp``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order.
    """
    return df.assign(
        tweet_id=df['tweet_id'].astype(str),        # identifiers, not quantities
        timestamp=pd.to_datetime(df['timestamp']),  # parse text into datetimes
    )

# keep only tweets with rating out of 10
def standardize_ratings(df):
    """Return the rows of ``df`` whose ``rating_denominator`` equals 10."""
    out_of_ten = df['rating_denominator'].eq(10)
    return df.loc[out_of_ten]

# clean wrong dog names
def clean_dog_names(df):
    """Return a copy of ``df`` with placeholder values in ``name`` set to missing.

    The exact values 'a', 'an', 'the' and 'None' are replaced with ``pd.NA``.
    The replacement is done with ``assign`` so the caller's frame is not
    modified.

    NOTE(review): the detection cell matches these words case-insensitively,
    while this replacement is case-sensitive — confirm which is intended.
    """
    placeholders = ['a', 'an', 'the', 'None']
    return df.assign(name=df['name'].replace(placeholders, pd.NA))

# combine dog stages into one column
def combine_dog_stages(df):
    """Collapse the four dog-stage columns into a single ``dog_stage`` column.

    For each row, the string values of ``doggo``, ``floofer``, ``pupper`` and
    ``puppo`` that are not the placeholder 'None' are joined with commas.
    Rows without any stage get ``pd.NA``. Non-string cells (for example missing
    values) are skipped instead of breaking the join.

    The caller's frame is not modified: the result is a new frame in which the
    four stage columns are removed and ``dog_stage`` is the last column.
    """
    stages = ['doggo', 'floofer', 'pupper', 'puppo']
    # Plain row tuples keep this working for an empty frame as well.
    labels = [
        ','.join(value for value in values if isinstance(value, str) and value != 'None')
        for values in df[stages].itertuples(index=False, name=None)
    ]
    dog_stage = pd.Series(labels, index=df.index, dtype=object).replace('', pd.NA)
    return df.drop(columns=stages).assign(dog_stage=dog_stage)

# keep only rows where the prediction is a dog
def remove_non_dog_predictions(df):
    """Return the rows of ``df`` whose ``p1_dog`` flag is True."""
    flagged_as_dog = df['p1_dog'].eq(True)
    return df.loc[flagged_as_dog]

# merge all dataframes
def merge_datasets(archive_df, image_df, json_df):
    """Left-join the image and JSON tables onto the archive table by ``tweet_id``.

    Every archive row is kept; columns from the other two tables are missing
    where no matching ``tweet_id`` exists.
    """
    return (
        archive_df
        .merge(image_df, how='left', on='tweet_id')
        .merge(json_df, how='left', on='tweet_id')
    )

In [None]:
import pandas as pd

# --- Load the three input files ---
archive = pd.read_csv('twitter-archive-enhanced (2).csv')
image = pd.read_csv('image-predictions (2).tsv', sep='\t')
tweet = pd.read_json('tweet-json (1).json', lines=True)

# --- Keep only rows that are neither retweets nor replies ---
archive = archive[
    archive['retweeted_status_id'].isna() & archive['in_reply_to_status_id'].isna()
]

# --- Remove the retweet/reply bookkeeping columns ---
archive = archive.drop(columns=[
    'retweeted_status_id', 'retweeted_status_user_id', 'retweeted_status_timestamp',
    'in_reply_to_status_id', 'in_reply_to_user_id'
])

# --- tweet_id as string in all three tables; timestamp as datetime ---
archive = archive.assign(
    tweet_id=archive['tweet_id'].astype(str),
    timestamp=pd.to_datetime(archive['timestamp']),
)
image = image.assign(tweet_id=image['tweet_id'].astype(str))
tweet = tweet.assign(tweet_id=tweet['id_str'].astype(str))

# --- One comma-separated dog_stage column: a stage is kept when the cell
# --- holds the column's own name; the four source columns are then removed ---
archive = archive.assign(
    dog_stage=archive[['doggo', 'floofer', 'pupper', 'puppo']].apply(
        lambda row: ','.join(name for name, value in row.items() if value == name),
        axis=1,
    )
).drop(columns=['doggo', 'floofer', 'pupper', 'puppo'])

# --- Keep archive rows that have an image-prediction row ---
archive = archive.loc[archive['tweet_id'].isin(image['tweet_id'])]

# --- Keep image rows whose p1_dog flag is True ---
image = image.loc[image['p1_dog'].eq(True)]

# --- Only the engagement columns are needed from the tweet table ---
tweet_clean = tweet.loc[:, ['tweet_id', 'retweet_count', 'favorite_count']]

# --- Inner-join the three tables on tweet_id ---
merge1 = archive.merge(image, on='tweet_id')
final = merge1.merge(tweet_clean, on='tweet_id')

# --- Persist the combined table ---
final.to_csv('twitter_archive_master.csv', index=False)
print("✅ File saved as: twitter_archive_master.csv")


# Visualisation

In [None]:
import pandas as pd

# NOTE(review): this cell appears to repeat the cleaning pipeline of the
# preceding code cell, storing the merged result in `df` — consider reusing
# that result instead of recomputing it.

# 1. Read the files
archive = pd.read_csv('twitter-archive-enhanced (2).csv')
image = pd.read_csv('image-predictions (2).tsv', sep='\t')
tweet = pd.read_json('tweet-json (1).json', lines=True)

# 2. Clean 'archive' data: keep rows that are neither retweets nor replies.
archive = archive[archive['retweeted_status_id'].isna()]
archive = archive[archive['in_reply_to_status_id'].isna()]
# Reassign instead of dropping in place: `archive` is a filtered slice here, and
# in-place edits of a slice trigger SettingWithCopyWarning and make re-runs fragile.
archive = archive.drop(columns=['retweeted_status_id', 'retweeted_status_user_id',
                                'retweeted_status_timestamp', 'in_reply_to_status_id',
                                'in_reply_to_user_id'])

# 3. Format columns
archive['tweet_id'] = archive['tweet_id'].astype(str)
image['tweet_id'] = image['tweet_id'].astype(str)
tweet['tweet_id'] = tweet['id_str'].astype(str)
archive['timestamp'] = pd.to_datetime(archive['timestamp'])

# 4. Combine dog stages into one column (a stage is kept when the cell holds
#    the column's own name), then remove the four source columns.
archive['dog_stage'] = archive[['doggo', 'floofer', 'pupper', 'puppo']].apply(
    lambda row: ','.join([stage for stage in row.index if row[stage] == stage]), axis=1)
archive = archive.drop(columns=['doggo', 'floofer', 'pupper', 'puppo'])

# 5. Filter data
archive = archive[archive['tweet_id'].isin(image['tweet_id'])]
image = image[image['p1_dog'] == True]
tweet_clean = tweet[['tweet_id', 'retweet_count', 'favorite_count']]

# 6. Merge everything (inner joins on tweet_id)
df = pd.merge(archive, image, on='tweet_id')
df = pd.merge(df, tweet_clean, on='tweet_id')

# 7. Save the final CSV
df.to_csv('twitter_archive_master.csv', index=False)
print("✅ File saved: twitter_archive_master.csv")

In [None]:
# Box plot of df1, drawn on an explicit Axes so the title and grid are attached
# to this figure rather than to pyplot's implicit "current" figure.
fig, ax = plt.subplots(figsize=(8, 6))
df1.plot(
    kind='box',
    ax=ax,
    color={'boxes': 'blue', 'whiskers': 'black', 'medians': 'red', 'caps': 'gray'},
)
ax.set_title("Box Plot")
ax.grid(True)
plt.show()

In [None]:
# Grid of histograms for df2, with the layout tightened on the current figure.
df2.hist(figsize=(4, 4))
plt.gcf().tight_layout()
plt.show()

In [None]:
# Histogram of the 'tweet_id' column of df1 to visualize its distribution
df1['tweet_id'].hist()

# The x-axis carries the tweet_id values that are being binned
plt.xlabel('tweet_id')

# The y-axis of a histogram is the number of rows per bin, not a timestamp
plt.ylabel('Number of Tweets')

# Title matches what is plotted: a single-column distribution
plt.title('Distribution of tweet_id')

# Display the plot
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Histogram of df1's 'rating_numerator' column, saved to disk before it is shown.
fig, ax = plt.subplots(figsize=(8, 5))
df1['rating_numerator'].hist(bins=15, edgecolor='black', ax=ax)
ax.set_title('Distribution of Dog Ratings')
ax.set_xlabel('Rating')
ax.set_ylabel('Number of Tweets')
fig.savefig('rating_distribution.png')  # write the figure to a PNG file
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Scatter of rating numerator against rating denominator for df1,
# saved to disk before it is shown.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(df1['rating_denominator'], df1['rating_numerator'], alpha=0.6, color='teal')
ax.set_title('Rating Numerator vs Denominator')
ax.set_xlabel('Rating Denominator')
ax.set_ylabel('Rating Numerator')
ax.grid(True)
fig.savefig('rating_scatter.png')
plt.show()



In [None]:
# Print the column labels of df1.
print(df1.columns)



In [None]:
import matplotlib.pyplot as plt

# Per stage column, count the cells that are present and not the placeholder
# 'None': the placeholder is masked to missing, then count() skips missing cells.
stage_counts = (
    df1[['doggo', 'floofer', 'pupper', 'puppo']]
    .where(lambda cells: cells != 'None')
    .count()
)

# Bar chart of the four counts
ax = stage_counts.plot(kind='bar', color='skyblue', edgecolor='black', figsize=(8, 5))
ax.set_title('Distribution of Dog Stages')
ax.set_xlabel('Dog Stage')
ax.set_ylabel('Count')
plt.xticks(rotation=0)
plt.tight_layout()

# Write the figure to a PNG file
plt.savefig('dog_stages.png')

# Render the figure
plt.show()
