**Linear Regression**

Link for the dataset: https://www.kaggle.com/datasets/kianindeed/imdb-movie-dataset-dec-2023

This dataset contains top IMDB movies updated till 15 Dec 2023. This file in the csv fromat and it contains 11 columns namely: Moive Name, Rating, Votes, Meta Score, Genre, PG Rating, Year, Duration, Cast, Director. The data has 1950 rows

**Cleaning and modifying data**

In [80]:
# install all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics

In [81]:
# uploading dataset to juputer notebook
df = pd.read_csv("imdb_movie_data_2023.csv")

In [82]:
# read the data and see how I can modify this dataset
df.head()

# I have to google some names of columns to better understand what they mean
# Metascore is considered the rating of a film. Scores are assigned 
# to movie's reviews of large group of the world's most respected critics, 
# and weighted average are applied to summarize their opinions range.
# https://www.imdb.com/list/ls051211184/#:~:text=Metascore%20is%20considered%20the%20rating,to%20summarize%20their%20opinions%20range.

# PG rating means to what audience these movies
# if there is any restrictions for audience

Unnamed: 0.1,Unnamed: 0,Moive Name,Rating,Votes,Meta Score,Genre,PG Rating,Year,Duration,Cast,Director
0,0,Leave the World Behind,6.5,90000.0,67.0,"Drama, Mystery, Thriller",R,2023,2h 18m,"Julia Roberts, Mahershala Ali, Ethan Hawke, My...",Sam Esmail
1,1,Wonka,7.4,24000.0,66.0,"Adventure, Comedy, Family",PG,2023,1h 56m,"Timothée Chalamet, Gustave Die, Murray McArthu...",Paul King
2,2,Poor Things,8.5,6700.0,86.0,"Comedy, Drama, Romance",R,2023,2h 21m,"Emma Stone, Mark Ruffalo, Willem Dafoe, Ramy Y...",Yorgos Lanthimos
3,3,Killers of the Flower Moon,7.8,128000.0,89.0,"Crime, Drama, History",R,2023,3h 26m,"Leonardo DiCaprio, Robert De Niro, Lily Gladst...",Martin Scorsese
4,4,May December,7.0,21000.0,85.0,"Comedy, Drama",R,2023,1h 57m,"Natalie Portman, Chris Tenzis, Charles Melton,...",Todd Haynes


In [83]:
# check the format of columns
df.dtypes

# I need to check and if it is possible to modify the next columns:
# Genre, PG Rating, Duration

Unnamed: 0      int64
Moive Name     object
Rating        float64
Votes         float64
Meta Score    float64
Genre          object
PG Rating      object
Year            int64
Duration       object
Cast           object
Director       object
dtype: object

In [84]:
# I can drop columns Cast and Director
# because they containt a lot of data that cannot be modified to numeric
# the Moive Name is unnecessary for the Linear Regression
# that is why we also drop this column
df = df.drop(columns=['Cast', 'Director', 'Moive Name'])

In [85]:
# I can start work with the Genre column
# we can split the data from every row to columns
# because there is a limitation of possible genres
# and they probably have an effect on the rating of these movies

# Spliting data from the Genre column into the list
# in order to make new columns from this
# df['Genre'] = df['Genre'].str.split(',')

In [86]:
# make new columns that contain genres of movies

#df = df.join(df['Genre'].apply(pd.Series).add_prefix('Genre_'))
#df

In [87]:
#unique_values = df['Genre_1'].unique()
#unique_values

# Drama and Romance
# History and Biography
# Action and Adventure

In [88]:
# I need to duplicate the dataframe
# in order to proceed with the Genre column
duplicate_df = df.copy()

In [89]:
duplicate_df

Unnamed: 0.1,Unnamed: 0,Rating,Votes,Meta Score,Genre,PG Rating,Year,Duration
0,0,6.5,90000.0,67.0,"Drama, Mystery, Thriller",R,2023,2h 18m
1,1,7.4,24000.0,66.0,"Adventure, Comedy, Family",PG,2023,1h 56m
2,2,8.5,6700.0,86.0,"Comedy, Drama, Romance",R,2023,2h 21m
3,3,7.8,128000.0,89.0,"Crime, Drama, History",R,2023,3h 26m
4,4,7.0,21000.0,85.0,"Comedy, Drama",R,2023,1h 57m
...,...,...,...,...,...,...,...,...
1945,1945,7.1,172000.0,59.0,"Comedy, Drama, Romance",PG-13,2008,1h 52m
1946,1946,7.6,198000.0,79.0,"Drama, Horror, Sci-Fi",R,1986,1h 36m
1947,1947,6.5,71000.0,46.0,"Comedy, Drama, Family",PG,1992,1h 44m
1948,1948,6.4,30000.0,,,PG,1994,1h 47m


In [91]:
df.dropna(inplace=True)
duplicate_df.dropna(inplace=True)

In [92]:
df = pd.DataFrame(df, columns=['Unnamed', 'Genre'])

# creating list of strings where each value is one number out of topic column
unique_values = ' '.join(df['Genre'].values.tolist()).split(' ')

# creating new column for each value in unique_values
for number in unique_values:
    df[f'{number}'] = 0
    
# changing 0 to 1 for every new column where topic contains number
for idx, row in df.iterrows():
    for number in row['Genre'].split(' '):
        df.loc[idx, f'{number}'] = 1
df.drop('Genre', axis=1, inplace=True)

In [95]:
df = df.drop(columns=['Unnamed'])
df

Unnamed: 0,Drama,Mystery,Thriller,Adventure,Comedy,Family,Romance,Crime,History,Drama.1,...,Sport,War,Sci-Fi,Western,Romance.1,Music,Musical,Thriller.1,History.1,Documentary
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1944,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1945,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1946,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1947,1,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [96]:
# Remove commas from column names using the rename method
df.rename(columns=lambda x: x.replace(',', ''), inplace=True)

# Display the DataFrame after removing commas from column names
df

Unnamed: 0,Drama,Mystery,Thriller,Adventure,Comedy,Family,Romance,Crime,History,Drama.1,...,Sport,War,Sci-Fi,Western,Romance.1,Music,Musical,Thriller.1,History.1,Documentary
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1944,1,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1945,1,0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1946,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1947,1,0,0,0,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [97]:
# I need to combine 2 dataframes in order to proceed futher
df = pd.merge(df, duplicate_df, left_index=True, right_index=True)


In [98]:
df

Unnamed: 0.1,Drama,Mystery,Thriller,Adventure,Comedy,Family,Romance,Crime,History,Drama.1,...,History.1,Documentary,Unnamed: 0,Rating,Votes,Meta Score,Genre,PG Rating,Year,Duration
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,6.5,90000.0,67.0,"Drama, Mystery, Thriller",R,2023,2h 18m
1,0,0,0,1,1,1,0,0,0,0,...,0,0,1,7.4,24000.0,66.0,"Adventure, Comedy, Family",PG,2023,1h 56m
2,1,0,0,0,1,0,1,0,0,0,...,0,0,2,8.5,6700.0,86.0,"Comedy, Drama, Romance",R,2023,2h 21m
3,1,0,0,0,0,0,0,1,1,0,...,0,0,3,7.8,128000.0,89.0,"Crime, Drama, History",R,2023,3h 26m
4,0,0,0,0,1,0,0,0,0,1,...,0,0,4,7.0,21000.0,85.0,"Comedy, Drama",R,2023,1h 57m
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1944,1,0,0,0,0,0,1,0,0,0,...,0,0,1944,6.3,22000.0,67.0,"Drama, Musical, Romance",R,2021,2h 21m
1945,1,0,0,0,1,0,1,0,0,0,...,0,0,1945,7.1,172000.0,59.0,"Comedy, Drama, Romance",PG-13,2008,1h 52m
1946,1,0,0,0,0,0,0,0,0,0,...,0,0,1946,7.6,198000.0,79.0,"Drama, Horror, Sci-Fi",R,1986,1h 36m
1947,1,0,0,0,1,1,0,0,0,0,...,0,0,1947,6.5,71000.0,46.0,"Comedy, Drama, Family",PG,1992,1h 44m


In [99]:
# this makes multiple columns with the variable 
from sklearn.preprocessing import OneHotEncoder
variables = ['PG Rating']

# use encoder in order to make columns with only numeric data
encoder = OneHotEncoder(sparse_output=False).set_output(transform="pandas")
one_hot_encoded = encoder.fit_transform(df[variables]).astype(int)
df = pd.concat([df,one_hot_encoded],axis=1).drop(columns=variables)

In [100]:
# we can delete NaN column becuase it simply means 
# that there is no info for PG Rating
# also we can drop PG Rating_Unrated for the same reason
# and also we can delete the last column: PG Rating_X
# becuase we used OneHotEncoder
df = df.drop(columns=['PG Rating_Unrated', 'PG Rating_nan', 'PG Rating_X'])

KeyError: "['PG Rating_Unrated' 'PG Rating_nan' 'PG Rating_X'] not found in axis"

In [None]:
# we can start to modify the last column Duration
# we need to ommit "h" and "m" and modify it into minutes
# in order to have only numeric data in this column
# before converting we need to drop all nan values
df.dropna(inplace=True)

In [None]:
# we have a function that convert string column with Duration data
# into the numeric one

def convert_to_minutes(duration_str):
    try:
        # if the value is already an integer, return it as is
        if isinstance(duration_str, int):
            return duration_str

        # split the string into parts based on 'h' and 'm'
        parts = duration_str.split()

        # initialize hours and minutes
        hours, minutes = 0, 0

        # check each part and update hours or minutes accordingly
        for part in parts:
            if 'h' in part:
                hours = int(part.replace('h', ''))
            elif 'm' in part:
                minutes = int(part.replace('m', ''))

        # calculate total minutes
        total_minutes = hours * 60 + minutes
        return total_minutes
    except Exception as e:
        print(f"Error processing {duration_str}: {e}")
        return pd.NA

# Apply the conversion function to the 'Duration' column
df['Duration'] = df['Duration'].apply(convert_to_minutes)

In [None]:
# checking dataset that we finally have
df