In [None]:
import pandas as pd
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import statistics
import pandas as pd
from collections import defaultdict
from collections import Counter
import numpy as np
from sklearn.model_selection import train_test_split
import plotly.graph_objects as go


In [None]:
df = pd.read_csv('C:/Users/sdole/PycharmProjects/Movie_Rating_Project/Original Dataset.csv')

# Pre-Processing

**Dropping rows where 'IMDB Score' is Null**

In [None]:
# Keep only the columns that are relevant for the analysis
print(f'cols before: {df.columns}')
df = df[['Title','Genre','Tags', 'Languages','Series or Movie','Runtime','Director','Writer','Actors', 'IMDb Score','Release Date','Summary']]
print(f'After changing columns: {df.columns}')

# How many nulls per column?
print(f'How many nulls: {df.isnull().sum()}')

# How many instances?
print(f'How many instances: {df.shape[0]}')

# How many nulls in 'IMDb Score'?
# The column name uses double quotes because reusing the f-string's own quote
# character inside the braces is a SyntaxError on Python versions before 3.12.
print(f'How many nulls in IMDb Score col: {df["IMDb Score"].isnull().sum()}')

# Drop the rows whose target value (IMDb Score) is missing
df.dropna(subset=['IMDb Score'], inplace=True)
# Number of instances left after removing the rows without an IMDb Score
print(f'How many instances after dropping nulls in IMDb Score col: {df.shape[0]}')

**Dropping Duplicates**

In [None]:
# Duplicates
# are there dup? 
print(f'are there duplicates? {df["Title"].nunique() != len(df)}')

# 1. adding the index as a column temporarily
df.reset_index(inplace=True)
df.rename(columns={'index': 'Original Index'}, inplace=True)

# saving into a csv, duplicated rows that will be deleted
duplicates = df[df.duplicated(subset=['Title', 'Release Date', 'Series or Movie'], keep='first')]
duplicates.to_csv('deleted_duplicates.csv', index=False)

# delete duplicates in df 
data_len_before = len(df)
df.drop_duplicates(subset=['Title', 'Release Date', 'Series or Movie'], keep='first', inplace=True)
data_len_after = len(df)
print(f'Number of rows dropped: {data_len_before - data_len_after}')

# Save df with the Original Index column to a new CSV file
df.to_csv('after_dropping_dup_with_idx.csv', index=False)

# Drop the 'Original Index' column and reset the index of the DataFrame
df.drop(columns=['Original Index'], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
print(f'Final number of instances: {df.shape[0]}')

In [None]:
# Cardinality of variables 
print(f'Number of Unique Values in each column: {df.nunique()}')

In [None]:
print(df.describe())

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

X = df.drop('IMDb Score', axis=1)  # Features
y = df['IMDb Score']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Print the shape of the resulting splits
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

### Null Values

**Lowercase all categorical columns (except Title and Summary)**

In [None]:
X_train.columns

In [None]:
#  TRAIN - lower
categorical = ['Genre', 'Tags', 'Languages', 'Series or Movie', 'Runtime', 'Director', 'Writer', 'Actors']
for column in categorical:
    print(column, X_train[column].head())
    X_train[column] = X_train[column].str.lower()
    print(column, X_train[column].head())

In [None]:
# TEST - lower
categorical = ['Genre', 'Tags', 'Languages', 'Series or Movie', 'Runtime', 'Director', 'Writer', 'Actors']
for column in categorical:
    print(column, X_test[column].head())
    X_test[column] = X_test[column].str.lower()
    print(column, X_test[column].head())

In [None]:
# Number of null values within each column - train
print(f'number of nulls for each column after deleting all rows where target var (imdb rating) is null:  {X_train.isnull().sum()}')

**Genre column**

In [None]:
# How many unique values before pre process
print(X_train['Genre'].nunique())

In [None]:
# TRAIN - Sorting the Genres alphabetically
def sort_genre(genre_col):
    """Return the comma-separated genres of one cell in alphabetical order.

    Args:
        genre_col: A single 'Genre' value, e.g. ``"drama, action"``, or a
            missing value.

    Returns:
        The genres stripped of surrounding whitespace, sorted and re-joined
        with ``", "``. Missing values (as reported by ``pd.isna``) are
        returned unchanged.
    """
    if not pd.isna(genre_col):
        pieces = [piece.strip() for piece in genre_col.split(',')]
        pieces.sort()
        return ', '.join(pieces)
    # Nothing to sort for a missing cell; hand it back as-is
    return genre_col


# Apply the function to the 'Genre' column
X_train['Genre'] = X_train['Genre'].apply(sort_genre)
print(X_train['Genre'].nunique())

# Calculate the percentage of occurrences of each category (combination).
# dropna=False keeps the missing-genre rows as their own category.
num_rows = len(X_train)
genre_percentage = X_train['Genre'].value_counts(dropna=False) / num_rows * 100
genre_percentage = genre_percentage.sort_values(ascending=False)
print(genre_percentage)


top_15_genres = genre_percentage.head(15)

# Plotting the data
fig, ax = plt.subplots(figsize=(12, 8))
top_15_genres.plot(kind='bar', color='skyblue', edgecolor='white', ax=ax)
# The title states 15 because that is how many combinations are plotted
ax.set_title('Top 15 Most Frequent Genre Combinations')
ax.set_xlabel('Genre Combination')
ax.set_ylabel('Percentage')
plt.xticks(rotation=45, ha='right')

# Adding percentage labels on top of bars
for i, percentage in enumerate(top_15_genres):
    ax.text(i, percentage + 0.1, f'{percentage:.2f}%', ha='center', va='bottom')
    

In [None]:
# TEST - Sorting the Genres alphabetically# Apply the function to the 'Genre' column
X_test['Genre'] = X_test['Genre'].apply(sort_genre)
print(X_test['Genre'].nunique())


In [None]:
# PLOT - Percentage of occurrences of each unique genre

from collections import Counter
# Function to calculate genre percentages
def calculate_genre_percentages(df, column_name):
    """Compute how often each individual genre occurs, as a percentage.

    Every non-null cell of ``column_name`` is split on commas, each piece is
    stripped of whitespace, and the pieces are counted across the whole column.

    Args:
        df (pd.DataFrame): Frame holding the genre column.
        column_name (str): Name of the column with comma-separated genres.

    Returns:
        pd.DataFrame: Columns ``'Genre'`` and ``'Percentage'`` (share of all
        genre mentions, 0-100), sorted from most to least frequent. The frame
        is empty, with the same two columns, when the column has no non-null
        values.
    """
    # Count in a single pass. Summing the per-row lists would copy the growing
    # list on every row and would yield 0 (not a list) for an empty column.
    genre_counts = Counter(
        genre.strip()
        for cell in df[column_name].dropna()
        for genre in cell.split(',')
    )
    total_genres = sum(genre_counts.values())
    genre_percentages_df = pd.DataFrame(
        [(genre, (count / total_genres) * 100) for genre, count in genre_counts.items()],
        columns=['Genre', 'Percentage'],
    )
    # Sort by percentage from highest to lowest
    return genre_percentages_df.sort_values(by='Percentage', ascending=False).reset_index(drop=True)

# Calculate and display the genre percentages
genre_percentages_X_train = calculate_genre_percentages(X_train, 'Genre')
print(genre_percentages_X_train)




# Plotting the data
# DataFrame.plot opens a new figure unless it is given an Axes, so the axes are
# created explicitly and passed in; otherwise the 20x10 figure stays blank.
fig, ax = plt.subplots(figsize=(20, 10))
genre_percentages_X_train.head(15).plot(x='Genre', y='Percentage', kind='bar', color='skyblue', edgecolor='white', ax=ax)
# 15 genres are plotted, and each bar is a single genre rather than a combination
ax.set_title('Top 15 Most Frequent Genres')
ax.set_xlabel('Genre')
ax.set_ylabel('Percentage')
plt.xticks(rotation=45, ha='right')

# Adding percentage labels on top of bars
for p in ax.patches:
    ax.annotate(f'{p.get_height():.2f}%', 
                (p.get_x() + p.get_width() / 2., p.get_height()), 
                ha='center', va='baseline', 
                xytext=(0, 10), 
                textcoords='offset points')

plt.show()

steps: 
1. Pre-process 'Tags' - lowercase, keep ASCII characters only, tokenize, lemmatize, keep as a list of words 
2. Create a list of the unique values in the 'Genre' column and find the most common genre 
3. Loop through the rows where 'Genre' is null and check whether 'Tags' contains a word that matches one of the unique genres. If no match is found, copy the first tag; if 'Tags' is also empty, fill 'Genre' with the most common genre.
4. Delete 'Tags' column

In [None]:
# TRAIN - 1. Pre-Process Tags column 
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

exclude_chars = {'@', '"', '#', '$', '%', '&', ',', '}', '{', '(', ')', '*', '^'}
ascii_chars_to_keep = set(string.printable) - exclude_chars
print(X_train['Tags'].head())

def pre_process_tags(text):
    """Convert one raw 'Tags' cell into a list of cleaned, lemmatized tokens.

    Args:
        text: The raw cell value. Anything that is not a ``str`` (NaN, None)
            is treated as "no tags".

    Returns:
        list[str]: The tokens produced by ``word_tokenize``, lowercased and
        reduced to the characters in ``ascii_chars_to_keep``, with empty
        results dropped and each survivor passed through
        ``WordNetLemmatizer.lemmatize``. An empty list for non-string input.
    """
    if not isinstance(text, str):
        return []

    cleaned = []
    for raw_token in word_tokenize(text):
        # Drop every character outside the allowed set, lowercase the rest
        kept = ''.join(ch.lower() for ch in raw_token if ch in ascii_chars_to_keep).strip()
        if kept:
            cleaned.append(kept)

    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(tok) for tok in cleaned]

            
X_train['Tags'] = X_train['Tags'].apply(pre_process_tags)


print(X_train['Tags'].head())

In [None]:
# TEST
print(X_test['Tags'].head())
X_test['Tags'] = X_test['Tags'].apply(pre_process_tags)
print(X_test['Tags'].head())

In [None]:
# TRAIN - 2. making a list of all unique Genres
unique_genres = set()
genre_count = {}
for genres in X_train['Genre'].dropna():
    for word in genres.split(','):
        word = word.lower().strip()
        unique_genres.add(word)
        if word in genre_count:
            genre_count[word] += 1
        else:
            genre_count[word] = 1

# Find the most common genre
genre_mode = max(genre_count, key=genre_count.get)
print(f'Most common Genre: {genre_mode}')


# 3. Function to fill missing genres based on tags and unique_genres
def fill_missing_genres(dataframe, genres_set, most_common_genre):
    """Fill null 'Genre' cells from the row's 'Tags', then drop 'Tags'.

    For each row whose 'Genre' is null:
      * if the row has tags, use the first tag (in tag order) that is a known
        genre; if none of the tags is a known genre, use the first tag;
      * if the row has no tags, use ``most_common_genre``.

    Scanning the tags in their own order makes the result reproducible.
    Iterating over ``genres_set`` instead would make the pick depend on set
    iteration order, which changes between interpreter runs for strings.

    Args:
        dataframe (pd.DataFrame): Frame with 'Genre' and 'Tags' columns;
            'Tags' cells are expected to be lists of tokens. Modified in place.
        genres_set (set): Known genre names.
        most_common_genre (str): Fallback for rows without usable tags.

    Returns:
        pd.DataFrame: The same ``dataframe`` object, without the 'Tags' column.
    """
    missing_genre = dataframe['Genre'].isna()
    for idx, tags in dataframe.loc[missing_genre, 'Tags'].items():
        if isinstance(tags, (list, tuple)) and len(tags) > 0:
            # First tag that is a known genre, otherwise simply the first tag
            replacement = next((tag for tag in tags if tag in genres_set), tags[0])
        else:
            # No usable tags (empty list or a non-list value such as NaN)
            replacement = most_common_genre
        dataframe.at[idx, 'Genre'] = replacement
    # Drop the 'Tags' column after filling in the genres
    dataframe.drop(columns=['Tags'], inplace=True)
    return dataframe

# Call the function
X_train = fill_missing_genres(X_train, unique_genres, genre_mode)

In [None]:
# TEST
X_test = fill_missing_genres(X_test, unique_genres, genre_mode)

In [None]:
# check now if all good 
print(X_train.isnull().sum())

**Language**


steps: 
1. Pre-process language - for each row split the value into a list and save only the first language
2. find mode in column - find the mode language ('language_mode')
3. Create actor language dict - returns a dictionary, each key is an actor, and each value is the most common language of that actor 
4. filling nulls language - gets a dictionary and a language mode. Fill null by the dictionary, if key not found, complete by the mode.

In [None]:
# 1. LANGUAGE COL WITH 1 VAL ONLY: gets a column language, and keeps the first value in each row 
def pre_process_language(text_language):
    """Reduce a comma-separated 'Languages' cell to its first language.

    Args:
        text_language: The raw cell value.

    Returns:
        The text before the first comma, stripped of surrounding whitespace,
        when the input is a string; otherwise (NaN/None) the input unchanged.
    """
    if not isinstance(text_language, str):
        return text_language
    first_language, _, _ = text_language.partition(',')
    return first_language.strip()

X_train['Languages'] = X_train['Languages'].apply(pre_process_language)

# 2. FIND LANGUAGE MODE: calculates the most common val in language (mode)
def find_mode_in_col(col):
    """Return the most frequent value of a column.

    Args:
        col (pd.Series): The column to inspect.

    Returns:
        The first entry of ``col.mode()``, or ``np.nan`` when the mode result
        is empty.
    """
    modes = col.mode()
    return np.nan if modes.empty else modes.iloc[0]

language_mode = find_mode_in_col(X_train['Languages'])
print(language_mode)


# 3. CREATE A DICT: {'CHRIS': ENGLISH, SPANISH, ETC.}
# Populate the dictionary with known first languages for each actor
# Iterate through all rows in the DataFrame
def create_actor_language_dict(dataframe):
    """Map every actor to the language they appear with most often.

    Rows whose 'Languages' value is not a string are skipped. For the other
    rows, the 'Actors' cell is split on commas and the row's language is
    recorded once for each listed actor.

    Args:
        dataframe (pd.DataFrame): Frame with 'Actors' and 'Languages' columns.

    Returns:
        dict: ``{actor name: statistics.mode of the languages recorded for
        that actor}``.
    """
    languages_by_actor = {}
    for _, record in dataframe.iterrows():
        language_cell = record['Languages']
        if not isinstance(language_cell, str):
            continue

        actors_cell = record['Actors']
        if type(actors_cell) is str:
            names = [name.strip() for name in actors_cell.split(',')]
        else:
            names = []
        language = language_cell.strip()

        # Nothing to record when the row has no actors or a blank language
        if not (names and language):
            continue
        for name in names:
            languages_by_actor.setdefault(name, []).append(language)

    # Collapse each actor's list of languages into its single most common one
    return {name: statistics.mode(seen) for name, seen in languages_by_actor.items()}
# calling the function 
actor_language_dict = create_actor_language_dict(X_train)     


# 4. filling in nulls of language rows with the actors dictionary
def filling_nulls_language(dataframe, actor_language_dict, default_language=None):
    """Fill null 'Languages' cells in place, using the row's first actor.

    For every row with a null 'Languages' value, the first name in the
    'Actors' cell is lowercased and looked up in ``actor_language_dict``.
    When the row has no actors, or the actor is unknown, the fallback language
    is used instead.

    Args:
        dataframe (pd.DataFrame): Frame with 'Languages' and 'Actors' columns.
            Modified in place.
        actor_language_dict (dict): Maps an actor name to a language.
        default_language: Fallback language. When omitted, the notebook-level
            ``language_mode`` variable is used, as before; passing it
            explicitly removes the dependency on that global.

    Returns:
        None
    """
    for idx, row in dataframe.iterrows():
        if not pd.isna(row['Languages']):
            continue

        language = None
        actors = row['Actors']
        if isinstance(actors, str):
            # Only the first listed actor is used for the lookup
            actor_key = actors.split(',')[0].strip().lower()
            language = actor_language_dict.get(actor_key)

        if language is None:
            # Resolved lazily so the global is only touched when actually needed
            language = language_mode if default_language is None else default_language
        dataframe.at[idx, 'Languages'] = language

filling_nulls_language(X_train, actor_language_dict)


In [None]:
# TEST
# 1. keeps the first language in each row 
X_test['Languages'] = X_test['Languages'].apply(pre_process_language)

# 2. filling nulls with the actor dictionary that we made in train
filling_nulls_language(X_test, actor_language_dict)

In [None]:
X_train.isnull().sum()

In [None]:
X_test.isnull().sum()

**director, writer, actors, release date - Random sampling imputation method**

steps: 
1. pre_process_category - gets a value from a column. If the value is not null, it splits the value on commas and keeps only the first item, e.g. 'chris brown, danna fox' returns 'chris brown'
2. fill null imputation - gets a dataframe and a column and fills the nulls while keeping the distribution of the variable

In [None]:
# 1. pre-process category by keeping the first value in each row
def pre_process_category(value):
    """Keep only the first, lowercased item of a comma-separated cell.

    Args:
        value: The raw cell value, e.g. ``"Chris Brown, Danna Fox"``.

    Returns:
        For a string: the text before the first comma, lowercased and stripped
        of surrounding whitespace (``"chris brown"``). Non-string values
        (NaN/None) are returned unchanged.
    """
    if not isinstance(value, str):
        return value
    # str.lower() returns a new string; its result has to be used, otherwise
    # the lowercasing silently does nothing.
    return value.lower().split(',')[0].strip()

In [None]:
# 2. random sampling imputation- filling nulls 
def fill_null_with_category(dataframe, column, random_state=None):
    """Impute nulls in ``column`` by sampling from its observed distribution.

    Each null is replaced, in place, by a category drawn at random with
    probability equal to that category's share of the non-null values.

    Args:
        dataframe (pd.DataFrame): Frame to impute. Modified in place.
        column (str): Name of the column to fill.
        random_state: Optional seed (or ``np.random.Generator``) that makes
            the draw reproducible. When omitted, NumPy's global random state
            is used, as before.

    Returns:
        pd.Series: The normalized value counts of the column computed before
        filling, so the same distribution can be reused on another dataset.

    Raises:
        ValueError: If the column has nulls but no non-null values to sample.
    """
    # Proportion of each category among the non-null values
    value_counts = dataframe[column].value_counts(normalize=True)
    # A boolean mask is used rather than index labels so that the assignment
    # stays correct even when the index contains duplicate labels.
    missing = dataframe[column].isnull()
    n_missing = int(missing.sum())
    if n_missing:
        if value_counts.empty:
            raise ValueError(
                f"Cannot impute '{column}': the column has no non-null values to sample from."
            )
        rng = np.random if random_state is None else np.random.default_rng(random_state)
        dataframe.loc[missing, column] = rng.choice(
            value_counts.index.to_numpy(), size=n_missing, p=value_counts.to_numpy()
        )
    return value_counts

In [None]:
# Director
X_train['Director'] = X_train['Director'].apply(pre_process_category)
Director_distribution = fill_null_with_category(X_train, 'Director')
X_train['Director'].isnull().sum()

In [None]:
# Writer
X_train['Writer'] = X_train['Writer'].apply(pre_process_category)
Writer_distribution = fill_null_with_category(X_train, 'Writer')
X_train['Writer'].isnull().sum()

In [None]:
# actors 
X_train['Actors'] = X_train['Actors'].apply(pre_process_category)
Actors_distribution = fill_null_with_category(X_train, 'Actors')
X_train['Actors'].isnull().sum()

In [None]:
# release date
X_train['Release Date'] = X_train['Release Date'].apply(pre_process_category)
Release_Date_distribution = fill_null_with_category(X_train, 'Release Date')
X_train['Release Date'].isnull().sum()

In [None]:
X_train.isnull().sum()

In [None]:
# How many unique values each column has now? 
column_info = []
for col in X_train.columns:
    col_name = col 
    col_unique = X_train[col].nunique()
    
        # adding the columns 
    column_info.append({
        'Column Name': col_name,
        'Unique Values': col_unique,
    })


column_info_df = pd.DataFrame(column_info)

# plot the table 
fig = go.Figure(data=[go.Table(
    header=dict(values=list(column_info_df.columns),
                fill_color= '#636EFA',
                align='left',
                font=dict(color='black', size=15)),
    cells=dict(values=[column_info_df['Column Name'], column_info_df['Unique Values']],
               fill_color='lavender',
               align='left',
               height=25,
               font=dict(color='black', size=13)))
])

# Update layout
fig.update_layout(template='plotly_white', width=550, height= 550)

# Show the table
fig.show()


In [None]:
# TEST 
# New function to use the value counts for the test set
def fill_null_with_category_test_set(dataframe, column, value_counts, random_state=None):
    """Impute nulls in ``column`` by sampling from a supplied distribution.

    Args:
        dataframe (pd.DataFrame): Frame to impute. Modified in place.
        column (str): Name of the column to fill.
        value_counts (pd.Series): Category probabilities (index = category,
            values = probability), e.g. the normalized value counts obtained
            from the training set.
        random_state: Optional seed (or ``np.random.Generator``) that makes
            the draw reproducible. When omitted, NumPy's global random state
            is used, as before.

    Returns:
        None

    Raises:
        ValueError: If the column has nulls but ``value_counts`` is empty.
    """
    # A boolean mask is used rather than index labels so that the assignment
    # stays correct even when the index contains duplicate labels.
    missing = dataframe[column].isnull()
    n_missing = int(missing.sum())
    if not n_missing:
        return
    if len(value_counts) == 0:
        raise ValueError(
            f"Cannot impute '{column}': the supplied distribution is empty."
        )
    rng = np.random if random_state is None else np.random.default_rng(random_state)
    dataframe.loc[missing, column] = rng.choice(
        value_counts.index.to_numpy(), size=n_missing, p=value_counts.to_numpy()
    )

In [None]:
# director
X_test['Director'] = X_test['Director'].apply(pre_process_category)
fill_null_with_category_test_set(X_test, 'Director', Director_distribution)
X_test['Director'].isnull().sum()

# Writer
X_test['Writer'] = X_test['Writer'].apply(pre_process_category)
fill_null_with_category_test_set(X_test, 'Writer', Writer_distribution)
X_test['Writer'].isnull().sum()

# actors 
X_test['Actors'] = X_test['Actors'].apply(pre_process_category)
fill_null_with_category_test_set(X_test, 'Actors', Actors_distribution)
X_test['Actors'].isnull().sum()

# release date
X_test['Release Date'] = X_test['Release Date'].apply(pre_process_category)
fill_null_with_category_test_set(X_test, 'Release Date', Release_Date_distribution)
X_test['Release Date'].isnull().sum()

In [None]:
X_test.isnull().sum()

### Feature Representation

1. The function gets a dataframe, a column to encode, and a set of unique values within the column 
2. First we iterate over the set and create a new column named after each value 
3. Then we loop through the rows. For each row we split the cell's value(s) into a list 
4. For each value in this list, if the value is in the set of unique values, we put 1 in the corresponding column 

In [None]:
def feature_representation_one_hot(dataframe, column_to_encode, unique_values_set):
    """One-hot encode a (possibly multi-valued) column against a fixed vocabulary.

    Each cell of ``column_to_encode`` is lowercased and split on commas, and
    every piece is stripped of surrounding whitespace. For each value in
    ``unique_values_set`` a column named ``f"{column_to_encode}_{value}"`` is
    created, holding 1 where the cell contains that value and 0 elsewhere.
    Cells that are not strings (e.g. NaN) get 0 in every new column.

    Args:
        dataframe (pd.DataFrame): The dataframe to transform. Modified in place.
        column_to_encode (str): The name of the column to one-hot encode.
        unique_values_set (set): The values to create binary columns for.

    Returns:
        pd.DataFrame: The same ``dataframe`` object, with the binary columns
        added and ``column_to_encode`` removed.
    """
    # Tokenize every cell once. Stripping matters: "action, comedy" splits into
    # "action" and " comedy", and the latter would never match the vocabulary.
    tokens_per_row = [
        {token.strip() for token in cell.lower().split(',')} if isinstance(cell, str) else set()
        for cell in dataframe[column_to_encode]
    ]

    for value in unique_values_set:
        column_name = f"{column_to_encode}_{value}"
        dataframe[column_name] = [int(value in tokens) for tokens in tokens_per_row]

    dataframe.drop(columns=[column_to_encode], inplace=True)
    return dataframe

**Genre**

In [None]:
print(X_train.columns)
X_train = feature_representation_one_hot(X_train, 'Genre', unique_genres)
print(X_train.columns)

In [None]:
# TEST 
print(X_test.columns)
X_test = feature_representation_one_hot(X_test, 'Genre', unique_genres)
print(X_test.columns)
print(X_test.isnull().sum())

In [None]:
print(X_train.shape)
print(y_train.shape)

**Director, Writer, Actors**

taking the top 15, the rest is 'other' 

**Director**

In [None]:
director_counts = X_train['Director'].value_counts()
total_rows = len(X_train)
director_percentage = ((director_counts / total_rows) * 100).sort_values(ascending=False)
director_percentage_set = set(director_percentage.index)
print(director_percentage)

top_directors = director_percentage.head(30)
plt.figure(figsize=(12, 8))
bars = plt.bar(top_directors.index, top_directors.values, color='orange')

# Adding the percentage labels on top of each bar
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.005, f'{yval:.2f}%', ha='center', va='bottom')

plt.xlabel('Director')
plt.ylabel('Percentage')
plt.title('Top Director by Percentage')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('Director distribution.png', facecolor='white', edgecolor='white')
# Display the plot
plt.show()

In [None]:
# convert every value in director column, to other, if it is not in set
top_15_directors_set = set(director_percentage.head(15).index)

def replace_non_top_director(dataframe, top_director_set):
    """Relabel every director outside ``top_director_set`` as "Other".

    Args:
        dataframe (pd.DataFrame): Frame with a 'Director' column. Modified in place.
        top_director_set (set): Director names to keep as they are.

    Returns:
        pd.DataFrame: The same ``dataframe`` object.
    """
    outside_top = ~dataframe['Director'].isin(top_director_set)
    dataframe.loc[outside_top, 'Director'] = "Other"
    return dataframe

X_train = replace_non_top_director(X_train, top_15_directors_set)

In [None]:
# Check that there are 16 unique values
X_train['Director'].nunique()

In [None]:
# Create the set of unique directors
directors_set = set(X_train['Director'].unique()) - {'Other'}
directors_set

In [None]:
X_train = X_train.copy()
X_test = X_test.copy()
print(X_train['Director'])

In [None]:
print(X_test['Director'])

In [None]:
# TRAIN 
print(X_train.columns)
X_train = feature_representation_one_hot(X_train, 'Director', directors_set)
print(X_train.columns)
print(X_train.isnull().sum())

In [None]:
print(X_test['Director'])

In [None]:
# TEST
print(X_test.columns)
X_test = feature_representation_one_hot(X_test, 'Director', directors_set)
print(X_test.columns)
print(X_test.isnull().sum())
print(len(X_test.columns))

In [None]:
print(len(X_test.columns))


**Writer**

In [None]:
# Calculate the percentage of rows attributed to each writer
writer_counts = X_train['Writer'].value_counts()
total_rows = len(X_train)
writer_percentage = ((writer_counts / total_rows) * 100).sort_values(ascending=False)
print(writer_percentage)
# NOTE: despite its name, this variable holds the 25 most frequent writers
top_15_writers = writer_percentage.head(25)
plt.figure(figsize=(12, 8))
bars = plt.bar(top_15_writers.index, top_15_writers.values, color='orange')

# Adding the percentage labels on top of each bar
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.005, f'{yval:.2f}%', ha='center', va='bottom')

plt.xlabel('Writers')
plt.ylabel('Percentage')
# The title states 25 because that is how many writers are plotted
plt.title('Top 25 Writers by Percentage')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Display the plot
plt.show()

In [None]:
# convert every value in writer column, to other, if it is not in set
top_15_writers_set = set(writer_percentage.head(15).index)
type(top_15_writers_set)

def replace_non_top_writers(dataframe, top_writers_set):
    """Relabel every writer outside ``top_writers_set`` as "Other".

    Args:
        dataframe (pd.DataFrame): Frame with a 'Writer' column. Modified in place.
        top_writers_set (set): Writer names to keep as they are.

    Returns:
        pd.DataFrame: The same ``dataframe`` object.
    """
    is_top_writer = dataframe['Writer'].isin(top_writers_set)
    dataframe.loc[~is_top_writer, 'Writer'] = "Other"
    return dataframe

X_train = replace_non_top_writers(X_train, top_15_writers_set)

In [None]:
# Create the set of unique directors
Writers_set = set(X_train['Writer'].unique()) - {'Other'}
Writers_set

In [None]:
# TRAIN 
print(X_train.columns)
X_train = feature_representation_one_hot(X_train, 'Writer', Writers_set)
print(X_train.columns)
print(X_train.isnull().sum())

In [None]:
# TEST
print(X_test.columns)
X_test = feature_representation_one_hot(X_test, 'Writer', Writers_set)
print(X_test.columns)
print(X_test.isnull().sum())
print(len(X_test.columns))

In [None]:
len(Writers_set)

**Actors**

In [None]:
# Percentage of rows attributed to each (first-listed) actor
actors_counts = X_train['Actors'].value_counts()
total_rows = len(X_train)
actors_percentage = ((actors_counts / total_rows) * 100).sort_values(ascending=False)
actors_percentage_set = set(actors_percentage.index)
print(actors_percentage)

top_actors = actors_percentage.head(45)
plt.figure(figsize=(12, 8))
bars = plt.bar(top_actors.index, top_actors.values, color='orange')

# Adding the percentage labels on top of each bar
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.005, f'{yval:.2f}%', ha='center', va='bottom')

# This chart shows actors; the labels previously said "Director"
plt.xlabel('Actors')
plt.ylabel('Percentage')
plt.title('Top Actors by Percentage')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()

# Display the plot
plt.show()

In [None]:
# convert every value in director column, to other, if it is not in set
top_15_actors_set = set(actors_percentage.head(15).index)
type(top_15_writers_set)

# NOTE: this reuses the name of the Director helper defined earlier in the
# notebook and replaces it from this cell onward; this version works on 'Actors'.
def replace_non_top_director(dataframe, top_actors_set):
    """Relabel every actor outside ``top_actors_set`` as "Other".

    Args:
        dataframe (pd.DataFrame): Frame with an 'Actors' column. Modified in place.
        top_actors_set (set): Actor names to keep as they are.

    Returns:
        pd.DataFrame: The same ``dataframe`` object.
    """
    outside_top = ~dataframe['Actors'].isin(top_actors_set)
    dataframe.loc[outside_top, 'Actors'] = "Other"
    return dataframe

X_train = replace_non_top_director(X_train, top_15_actors_set)

In [None]:
# Create the set of unique directors
actor_set = set(X_train['Actors'].unique()) - {'Other'}
actor_set

In [None]:
# TRAIN 
print(X_train.columns)
X_train = feature_representation_one_hot(X_train, 'Actors', actor_set)
print(X_train.columns)
print(X_train.isnull().sum())

In [None]:
print(X_test['Actors'])

In [None]:
# TRAIN 
print(X_test.columns)
X_test = feature_representation_one_hot(X_test, 'Actors', actor_set)
print(X_test.columns)
print(X_test.isnull().sum())

**Language**

In [None]:
language_count = X_train['Languages'].value_counts()
total_rows = len(X_train)
Language_percentage_sorted = ((language_count / total_rows) * 100).sort_values(ascending=False)
Language_percentage_sorted_set = set(Language_percentage_sorted.index)
print(Language_percentage_sorted)

top_languages = Language_percentage_sorted.head(20)
plt.figure(figsize=(12, 8))
bars = plt.bar(top_languages.index, top_languages.values, color='blue')

# Adding the percentage labels on top of each bar
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + bar.get_width()/2, yval + 0.005, f'{yval:.2f}%', ha='center', va='bottom')

plt.xlabel('Languages')
plt.ylabel('Percentage')
plt.title('Top Languages by Percentage')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.savefig('Language distribution.png', facecolor='white', edgecolor='white')
# Display the plot
plt.show()

In [None]:
Language_percentage_sorted_set

In [None]:
# convert every value in director column, to other, if it is not in set
top_15_language_set = set(Language_percentage_sorted.head(15).index)
type(top_15_language_set)

def replace_non_top_language(dataframe, top_language_set):
    """Relabel every language outside ``top_language_set`` as "Other".

    Args:
        dataframe (pd.DataFrame): Frame with a 'Languages' column. Modified in place.
        top_language_set (set): Language names to keep as they are.

    Returns:
        pd.DataFrame: The same ``dataframe`` object.
    """
    is_top_language = dataframe['Languages'].isin(top_language_set)
    dataframe.loc[~is_top_language, 'Languages'] = "Other"
    return dataframe

X_train = replace_non_top_language(X_train, top_15_language_set)

In [None]:
# Create the set of unique directors
Languages_set = set(X_train['Languages'].unique()) - {'Other'}
Languages_set

In [None]:
# TRAIN 
print(X_train.columns)
X_train = feature_representation_one_hot(X_train, 'Languages', Languages_set)
print(X_train.columns)
print(X_train.isnull().sum())

In [None]:
# TRAIN 
print(X_test.columns)
X_test = feature_representation_one_hot(X_test, 'Languages', Languages_set)
print(X_test.columns)
print(X_test.isnull().sum())

**Runtime**

In [None]:
# TRAIN
# Represents as an ordinal variable

runtime_order = ['< 30 minutes', '30-60 mins', '1-2 hour', '> 2 hrs']
runtime_mapping = {runtime: index for index, runtime in enumerate(runtime_order)}


# Apply the mapping to the Runtime column
X_train['Runtime'] = X_train['Runtime'].map(runtime_mapping)

print(X_train['Runtime'].nunique())
print(X_train['Runtime'].unique())

In [None]:
# TEST
# Apply the same mapping to the Runtime column in the test set
X_test['Runtime'] = X_test['Runtime'].map(runtime_mapping)
print(X_test['Runtime'].nunique())
print(X_test['Runtime'].unique())

**release date**

In [None]:
# TRAIN
# Convert to date type
X_train['Release Date'] = pd.to_datetime(X_train['Release Date'])

# Extract year, month, and day using the dt accessor
X_train['Released_Year'] = X_train['Release Date'].dt.year
X_train['Released_Month'] = X_train['Release Date'].dt.month
# X_train['Released_Day'] = X_train['Release Date'].dt.day

# Drop the 'Release Date' column
X_train.drop(columns=['Release Date'], inplace=True)

# print columns to check
X_train.columns

In [None]:
# check datatype of releases  
print(X_train[['Released_Year', 'Released_Month']].dtypes)

In [None]:
# TEST
# Convert to date type
X_test['Release Date'] = pd.to_datetime(X_test['Release Date'])

# Extract year, month, and day using the dt accessor
X_test['Released_Year'] = X_test['Release Date'].dt.year
X_test['Released_Month'] = X_test['Release Date'].dt.month
# X_test['Released_Day'] = X_test['Release Date'].dt.day

# Drop the 'Release Date' column
X_test.drop(columns=['Release Date'], inplace=True)

# print columns to check
X_test.columns

**Series or Movie**

In [None]:
# TRAIN
# Gets a dataframe, column to encode and positive (which category will encode as 1) 
def binary_encode_column(dataframe, column_to_encode, positive_value):
    """Replace a two-category column with a single 0/1 indicator column.

    Args:
        dataframe (pd.DataFrame): Source frame. It is not modified.
        column_to_encode (str): Name of the column to encode.
        positive_value (str): The category encoded as 1; every other value
            (including nulls) is encoded as 0.

    Returns:
        pd.DataFrame: A new frame without ``column_to_encode`` and with an
        integer column named ``column_to_encode + '_' + positive_value``
        appended at the end.
    """
    indicator_name = column_to_encode + '_' + positive_value
    indicator = (dataframe[column_to_encode] == positive_value).astype(int)
    # Build the result on a new frame, so the caller's frame is not left with a
    # stray indicator column next to the original one.
    return dataframe.drop(columns=[column_to_encode]).assign(**{indicator_name: indicator})

# Example usage:
# Assuming df is your DataFrame
X_train = binary_encode_column(X_train, 'Series or Movie', 'movie')

In [None]:
X_train["Series or Movie_movie"]

In [None]:
# TEST
X_test = binary_encode_column(X_test, 'Series or Movie', 'movie')

In [None]:
print(X_test.columns)

## Feature Extraction

**Summary - Count Words**

In [None]:
import re


#text = df['Summary']
def word_count(text):
    """Count the words in a piece of text.

    Hyphens are treated as word separators, and every character that is
    neither a word character nor whitespace is removed before the text is
    split on whitespace.

    Args:
        text: The text to measure.

    Returns:
        int: Number of whitespace-separated words; 0 for non-string input.
    """
    if not isinstance(text, str):
        return 0
    without_punctuation = re.sub(r'[^\w\s]', '', text.replace('-', ' '))
    return len(without_punctuation.split())


# Apply the word_count function to the 'Summary' column
X_train['Summary_length'] = X_train['Summary'].apply(word_count)


#df.head(10)
# cheking the data type 
X_train['Summary_length'].dtype

In [None]:
# TEST 
X_test['Summary_length'] = X_test['Summary'].apply(word_count)


In [None]:
print(X_train.columns)
print(X_test.columns)
print(len(X_test.columns)==len(X_train.columns))

**Summary - Sentiment Analysis**

We do not remove stop words because they are meaningful in sentiment analysis - for example, "not" is a stop word. We also do not remove punctuation marks because they are part of the VADER lexicon.

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# print the vocabulary 
analyzer.lexicon

In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer

analyzer = SentimentIntensityAnalyzer()

# Gets a text, extract pos, neg, neu scores 
def sentiment_scores(text):
    """Return the VADER positive, negative and neutral scores of ``text``.

    Parameters
    ----------
    text : str or any
        The text to score. Non-string values (such as NaN) are scored as empty text
        instead of raising.

    Returns
    -------
    pandas.Series
        Three values in the order ``[pos, neg, neu]``, taken from
        ``analyzer.polarity_scores``.

    Notes
    -----
    The text is passed to the analyzer unchanged. The earlier ``text.lower()`` call was
    removed: its result was discarded (strings are immutable), it raised AttributeError
    on non-string values, and lower-casing is not wanted here because VADER uses
    capitalisation as an intensity cue.
    """
    if not isinstance(text, str):
        text = ''
    scores = analyzer.polarity_scores(text)
    return pd.Series([scores['pos'], scores['neg'], scores['neu']])


X_train[['positive', 'negative', 'neutral']] = X_train['Summary'].apply(sentiment_scores)
print(X_train[['positive', 'negative', 'neutral']].head())

In [None]:
# check that the columns where added 
X_train.columns 

In [None]:
# print an example of a sentence and its scores 
example = X_train[['Summary','positive', 'negative', 'neutral']].head(5)
example

In [None]:
# TEST
X_test[['positive', 'negative', 'neutral']] = X_test['Summary'].apply(sentiment_scores)


In [None]:
print(X_train.columns)
print(X_test.columns)
print(len(X_test.columns)==len(X_train.columns))

**Dropping Title and Summary columns**

In [None]:
# TRAIN 
X_train.drop('Title', axis=1, inplace=True)
X_train.drop('Summary', axis=1, inplace=True)

In [None]:
# TEST 
X_test.drop('Title', axis=1, inplace=True)
X_test.drop('Summary', axis=1, inplace=True)

In [None]:
print(X_train.shape)
print(y_train.shape)

**Checking dataset columns**

In [None]:
print(X_train.columns)

**Saving dataset as csv**

In [None]:
print(type(X_train))
print(type(y_train))

In [None]:
# saving X train and y train as train csv 
train_set = pd.concat([X_train, y_train], axis=1)
train_set.to_csv('train_set.csv', index=False)


# saving X test and Y test as test csv 
test_set = pd.concat([X_test, y_test], axis=1)
test_set.to_csv('test_set.csv', index=False)

In [None]:
# loading the CSV files and get the X train, Y train, X test, Y test 
import pandas as pd

def load_and_split_data(train_file: str, test_file: str, target_column: str):
    """Read the train and test CSV files and separate features from the target.

    Parameters
    ----------
    train_file, test_file : str
        Paths of the CSV files to read with ``pd.read_csv``.
    target_column : str
        Name of the column holding the target; it must be present in both files.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` where each ``X`` is the frame without
        ``target_column`` and each ``y`` is that column as a Series.
    """
    train_frame = pd.read_csv(train_file)
    test_frame = pd.read_csv(test_file)

    def _features_and_target(frame):
        # everything except the target column, then the target column itself
        return frame.drop(columns=[target_column]), frame[target_column]

    return (*_features_and_target(train_frame), *_features_and_target(test_frame))

# Example usage:
# NOTE(review): the cell above writes 'train_set.csv' / 'test_set.csv' relative to the working
# directory, while this call reads from an absolute, machine-specific folder
# ('train_tests_datasets') — confirm these are the same files before relying on a fresh run.
X_train, y_train, X_test, y_test = load_and_split_data('C:/Users/sdole/PycharmProjects/Movie_Rating_Project/train_tests_datasets/train_set.csv', "C:/Users/sdole/PycharmProjects/Movie_Rating_Project/train_tests_datasets/test_set.csv", target_column='IMDb Score')

In [None]:
y_train.shape

In [None]:
X_train.shape

# Model Training

In [None]:
from sklearn.tree import DecisionTreeRegressor, plot_tree
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error as MSE
from sklearn.ensemble import RandomForestRegressor

### Default - Decision tree

In [None]:
# train a default tree 
default_tree = DecisionTreeRegressor(random_state=42)
print(default_tree.get_params())
default_tree.fit(X_train, y_train)

# structure of the fitted tree
print(f"Tree Depth: {default_tree.get_depth()}")
# get_n_leaves() is the number of leaf nodes, so it is reported under that name
print(f"Number of Leaf Nodes: {default_tree.get_n_leaves()}")
# the smallest leaf size is read from the low-level tree: leaves have a left-child index of -1
print(f"Minimum Samples in Leaf Nodes: {default_tree.tree_.n_node_samples[default_tree.tree_.children_left == -1].min()}")

# error on the data the tree was fitted on
y_train_pred = default_tree.predict(X_train)
train_MSE = MSE(y_train,y_train_pred)
train_RMSE = train_MSE**(1/2)
print(f'MSE on train set: {train_MSE}')
print(f'RMSE on train set: {train_RMSE}')

# error on the held-out test set
y_test_pred = default_tree.predict(X_test)
test_MSE = MSE(y_test,y_test_pred)
test_RMSE = test_MSE**(1/2)
print(f'MSE on test set: {test_MSE}')
print(f'RMSE on test set: {test_RMSE}')

In [None]:
# Access the low-level tree structure of the fitted default decision tree
tree = default_tree.tree_

# Identify leaf nodes: boolean mask with one entry per node, True where the left-child index is -1
is_leaf = tree.children_left == -1
print(is_leaf)
# Get the number of training samples that reached each leaf node
samples_in_leaves = tree.n_node_samples[is_leaf]
print(samples_in_leaves)
# Calculate the mean number of samples per leaf
mean_samples_per_leaf = np.mean(samples_in_leaves)

print(f"Mean number of samples per leaf: {mean_samples_per_leaf:.2f}")

In [None]:
# split only for plotting 
x_train_split, x_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

res = []
samples_in_leaf = range(1, 100)



for samples in samples_in_leaf:
    print(f"Evaluating min_samples_leaf={samples}")
    model = DecisionTreeRegressor(random_state=42, min_samples_leaf=samples)
    model.fit(x_train_split, y_train_split)
    
    # Predict on training data
    y_train_pred = model.predict(x_train_split)
    train_MSE = MSE(y_train_split,y_train_pred)
    train_RMSE = train_MSE**(1/2)
    
    # Predict on validation data
    y_val_pred = model.predict(x_val)
    val_MSE = MSE(y_val,y_val_pred)
    val_RMSE = val_MSE**(1/2)

    res.append({
        'samples_in_a_leaf': samples, 
        'train_rmse': train_RMSE, 
        'val_rmse': val_RMSE
    })

# Convert the list of results to a DataFrame
res = pd.DataFrame(res)

# Plotting the results
plt.figure(figsize=(13, 4))
plt.plot(res['samples_in_a_leaf'], res['train_rmse'], marker='o', markersize=4)
plt.plot(res['samples_in_a_leaf'], res['val_rmse'], marker='o', markersize=4)
plt.legend(['Train RMSE', 'Validation RMSE'])
plt.xlabel('Min Samples in a Leaf')
plt.ylabel('RMSE')
plt.title('RMSE by Min samples leaf - Decision tree')
plt.savefig('Min_Samples_in_a_Leaf.png', facecolor='white', edgecolor='white')
plt.show()

In [None]:
res = []
max_depth = range(1,52)



# Fit one decision tree per max_depth value and record train / validation RMSE
for depth in max_depth:
    # the swept hyperparameter is max_depth (the message used to say min_samples_leaf)
    print(f"Evaluating max_depth={depth}")
    model = DecisionTreeRegressor(random_state=42, max_depth=depth)
    model.fit(x_train_split, y_train_split)
    
    # Predict on training data
    y_train_pred = model.predict(x_train_split)
    train_MSE = MSE(y_train_split,y_train_pred)
    train_RMSE = train_MSE**(1/2)
    
    # Predict on validation data
    y_val_pred = model.predict(x_val)
    val_MSE = MSE(y_val,y_val_pred)
    val_RMSE = val_MSE**(1/2)

    res.append({
        'depth': depth, 
        'train_rmse': train_RMSE, 
        'val_rmse': val_RMSE
    })

# Convert the list of results to a DataFrame
res = pd.DataFrame(res)

# Plotting the results
plt.figure(figsize=(13, 4))
plt.plot(res['depth'], res['train_rmse'], marker='o', markersize=4)
plt.plot(res['depth'], res['val_rmse'], marker='o', markersize=4)
plt.legend(['Train RMSE', 'Validation RMSE'])
plt.xlabel('Depth')
plt.ylabel('RMSE')
plt.title('RMSE by Max depth - Decision tree')
plt.savefig('max_depth_decision_tree.png', facecolor='white', edgecolor='white')
plt.show()

## Hyperparameter tuning - Decision tree

In [None]:
# Decision tree 
kf = KFold(n_splits=5, shuffle=True, random_state=42)
tree_model = DecisionTreeRegressor(random_state=42)
param_grid = {
    'min_samples_leaf': np.arange(10,60),
    'max_depth': list(np.arange(10, 40)) + [None],
}

rand_search = RandomizedSearchCV(estimator=tree_model, param_distributions=param_grid, scoring='neg_mean_squared_error' , cv=kf, n_jobs=-1, refit=True, return_train_score=True, n_iter=600, random_state=42)
rand_search.fit(X_train, y_train)

# Best model and its hyperparameters
best_model = rand_search.best_estimator_
best_hyperparams = rand_search.best_params_
print("Best hyperparameters:", best_hyperparams)

# Predictions on the training set
y_train_pred = best_model.predict(X_train)
train_mse = MSE(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
print(f"Train set MSE: {train_mse:.3f}, RMSE: {train_rmse:.3f}")

# Predictions on the test set
y_test_pred = best_model.predict(X_test)
test_mse = MSE(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
print(f"Test set MSE: {test_mse:.3f}, RMSE: {test_rmse:.3f}")


# cv_results_
cv_results = pd.DataFrame(rand_search.cv_results_)
print(cv_results.columns)

In [None]:

# CV results
cv_results = pd.DataFrame(rand_search.cv_results_)
selected_columns = ['params', 'mean_train_score', 'mean_test_score', 'std_test_score', 'rank_test_score']
df_selected = cv_results[selected_columns]

# Display the CV results DataFrame
print(df_selected)

In [None]:
import matplotlib.pyplot as plt

# Adjusted plot_configurations function
def plot_configurations(result_df, is_best, model_name):
    """Draw the 10 best or 10 worst search configurations as a table and save it as a PNG.

    Parameters
    ----------
    result_df : pandas.DataFrame
        Search results; must contain a 'rank_test_score' column. All columns are shown.
    is_best : bool
        If truthy, the 10 rows with the lowest rank are shown and the file is named
        '<model_name> - top 10 configurations.png'; otherwise the 10 rows with the highest
        rank are shown and the file is named '<model_name> - worst 10 configurations.png'.
    model_name : str
        Prefix used in the output filename.

    The figure is written to the working directory and then displayed.
    """
    show_best = bool(is_best)
    dataframe_to_plot = result_df.sort_values(by='rank_test_score', ascending=show_best).head(10)
    filename = (
        f'{model_name} - top 10 configurations.png'
        if show_best
        else f'{model_name} - worst 10 configurations.png'
    )

    _fig, ax = plt.subplots(figsize=(18, 4))

    # the axes only serve as a canvas for the table
    ax.axis('tight')
    ax.axis('off')
    table = ax.table(
        cellText=dataframe_to_plot.values,
        colLabels=dataframe_to_plot.columns,
        cellLoc='center',
        loc='center',
    )
    table.auto_set_font_size(False)
    table.set_fontsize(16)

    table.auto_set_column_width([0, 1, 2])

    # Give every cell (header and body) the same size
    for cell in table.get_celld().values():
        cell.set_width(0.35)
        cell.set_height(0.15)

    # Save the table as an image
    plt.savefig(filename, bbox_inches='tight', pad_inches=0.5)
    plt.show()

# Use the adjusted function to plot the 10 best and 10 worst configurations
plot_configurations(result_df=df_selected, is_best=True, model_name='Decision Tree')
plot_configurations(result_df=df_selected, is_best=False, model_name='Decision Tree')


In [None]:
# Get feature importance
feature_importance = best_model.feature_importances_

# dataframe of feature importance
feature_names = X_train.columns 
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance
})

# Sort the DataFrame by importance
importance_df = importance_df.sort_values(by='Importance', ascending=False)
importance_df = importance_df[importance_df['Importance']>0]
# Plot the feature importance
plt.figure(figsize=(14, 6))
plt.barh(importance_df['Feature'], importance_df['Importance'])
plt.gca().invert_yaxis() 
plt.xlabel('Importance')
plt.title('Feature Importance - Decision tree', fontweight='bold')
plt.savefig('Feature Importance- Decision Tree.png', facecolor='white', edgecolor='white')
plt.show()

### Default - Random forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# default model 
default_tree = RandomForestRegressor(random_state=42, n_jobs=-1)


print(default_tree.get_params())
default_tree.fit(X_train, y_train)

y_train_pred = default_tree.predict(X_train)
train_MSE = MSE(y_train,y_train_pred)
train_RMSE = train_MSE**(1/2)
print(f'MSE on train set: {train_MSE}')
print(f'RMSE on train set: {train_RMSE}')

# error of the default random forest on the held-out test set
y_test_pred = default_tree.predict(X_test)
test_MSE = MSE(y_test,y_test_pred)
test_RMSE = test_MSE**(1/2)
print(f'MSE on test set: {test_MSE}')
print(f'RMSE on test set: {test_RMSE}')


# finding parameters values in default random forest
num_trees = len(default_tree.estimators_)
max_depths = []
samples_per_leaf = []

for tree in default_tree.estimators_:
    max_depths.append(tree.tree_.max_depth)
    
    n_node_samples = tree.tree_.n_node_samples
    is_leaf = tree.tree_.children_left == -1  
    leaf_samples = n_node_samples[is_leaf]
    samples_per_leaf.extend(leaf_samples)
    
print(f"Average maximum depth: {sum(max_depths) / num_trees}")
print(f"Mean samples per leaf: {np.mean(samples_per_leaf)}")

In [None]:
# split only for plotting 
x_train_split, x_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

res = []
samples_in_leaf = range(1, 100)



for samples in samples_in_leaf:
    print(f"Evaluating min_samples_leaf={samples}")
    model = RandomForestRegressor(random_state=42, min_samples_leaf=samples)
    model.fit(x_train_split, y_train_split)
    
    # Predict on training data
    y_train_pred = model.predict(x_train_split)
    train_MSE = MSE(y_train_split,y_train_pred)
    train_RMSE = train_MSE**(1/2)
    
    # Predict on validation data
    y_val_pred = model.predict(x_val)
    val_MSE = MSE(y_val,y_val_pred)
    val_RMSE = val_MSE**(1/2)

    res.append({
        'samples_in_a_leaf': samples, 
        'train_rmse': train_RMSE, 
        'val_rmse': val_RMSE
    })

# Convert the list of results to a DataFrame
res = pd.DataFrame(res)

# Plotting the results
plt.figure(figsize=(13, 4))
plt.plot(res['samples_in_a_leaf'], res['train_rmse'], marker='o', markersize=4)
plt.plot(res['samples_in_a_leaf'], res['val_rmse'], marker='o', markersize=4)
plt.legend(['Train RMSE', 'Validation RMSE'])
plt.xlabel('Min Samples in a Leaf')
plt.ylabel('RMSE')
# This sweep uses RandomForestRegressor: title it accordingly and save under its own file name so
# the decision tree figure 'Min_Samples_in_a_Leaf.png' written earlier is not overwritten.
plt.title('RMSE by Min samples leaf - Random forest')
plt.savefig('Min_Samples_in_a_Leaf - forest.png', facecolor='white', edgecolor='white')
plt.show()

In [None]:
# Train / validation RMSE of a random forest as a function of max_depth
res = []
max_depth = range(1,55)

for depth in max_depth:
    # the swept hyperparameter is max_depth (the message used to say min_samples_leaf)
    print(f"Evaluating max_depth={depth}")
    # n_jobs=-1 uses all cores, as in the other random forest cells of this notebook
    model = RandomForestRegressor(random_state=42, max_depth=depth, n_jobs=-1)
    model.fit(x_train_split, y_train_split)
    
    # Predict on training data
    y_train_pred = model.predict(x_train_split)
    train_MSE = MSE(y_train_split,y_train_pred)
    train_RMSE = train_MSE**(1/2)
    
    # Predict on validation data
    y_val_pred = model.predict(x_val)
    val_MSE = MSE(y_val,y_val_pred)
    val_RMSE = val_MSE**(1/2)

    res.append({
        'depth': depth, 
        'train_rmse': train_RMSE, 
        'val_rmse': val_RMSE
    })

# Convert the list of results to a DataFrame
res = pd.DataFrame(res)

# Plotting the results
plt.figure(figsize=(13, 4))
plt.plot(res['depth'], res['train_rmse'], marker='o', markersize=4)
plt.plot(res['depth'], res['val_rmse'], marker='o', markersize=4)
plt.legend(['Train RMSE', 'Validation RMSE'])
plt.xlabel('Depth')
plt.ylabel('RMSE')
# Random forest results: own title and own file name, so the decision tree figure
# 'max_depth_decision_tree.png' written earlier is not overwritten.
plt.title('RMSE by Max depth - Random forest')
plt.savefig('max_depth_random_forest.png', facecolor='white', edgecolor='white')
plt.show()

In [None]:
# Train / validation RMSE of a random forest as a function of the number of estimators
res = []
estimators_list = range(10,200)
for estimators_number in estimators_list:
    print(f"Evaluating n_estimators={estimators_number}")
    model = RandomForestRegressor(random_state=42, n_jobs=-1, n_estimators=estimators_number)
    # Fit on the training split only. x_val was carved out of X_train, so fitting on all of
    # X_train (as before) let the model see the validation rows and made val_RMSE optimistic.
    model.fit(x_train_split, y_train_split)
    
    # Predict on training data
    y_train_pred = model.predict(x_train_split)
    train_MSE = MSE(y_train_split,y_train_pred)
    train_RMSE = train_MSE**(1/2)
    
    # Predict on validation data
    y_val_pred = model.predict(x_val)
    val_MSE = MSE(y_val,y_val_pred)
    val_RMSE = val_MSE**(1/2)
    
    res.append({'n_estimators': estimators_number, 
                      'train_RMSE': train_RMSE, 
                      'val_RMSE': val_RMSE, 
                      })

# Convert the list of results to a DataFrame
res = pd.DataFrame(res)
# Plotting the results
plt.figure(figsize=(13, 4))
plt.plot(res['n_estimators'], res['train_RMSE'], marker='o', markersize=4)
plt.plot(res['n_estimators'], res['val_RMSE'], marker='o', markersize=4)
plt.legend(['Train RMSE', 'Validation RMSE'])
plt.xlabel('number of estimators')
plt.ylabel('RMSE')
plt.title('RMSE by Estimators number', fontweight='bold')
plt.savefig('number of estimators - forest.png', facecolor='white', edgecolor='white')
plt.show()

## Hyperparameter tuning - Random Forest

In [None]:
# Random forest: 5-fold shuffled CV and the hyperparameter values sampled by the randomized search
kf = KFold(n_splits=5, shuffle=True, random_state=42)
tree_model = RandomForestRegressor(random_state=42)
param_grid = {
    # 50 to 200 trees in steps of 10
    'n_estimators': [50,60,70,80,90,100,110,120,130,140,150,160,170,180,190,200],
    # 10 to 59 samples per leaf (np.arange excludes the upper bound)
    'min_samples_leaf': np.arange(10,60),
    # depths 10 to 39, plus None for an unlimited depth
    'max_depth': list(np.arange(10, 40)) + [None],
}

rand_search = RandomizedSearchCV(estimator=tree_model, param_distributions=param_grid, scoring='neg_mean_squared_error' , cv=kf, n_jobs=-1, refit=True, return_train_score=True, n_iter=600, random_state=42)
rand_search.fit(X_train, y_train)

# Best model and its hyperparameters
best_model = rand_search.best_estimator_
best_hyperparams = rand_search.best_params_
print("Best hyperparameters:", best_hyperparams)

# Predictions on the training set
y_train_pred = best_model.predict(X_train)
train_mse = MSE(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
print(f"Train set MSE: {train_mse:.3f}, RMSE: {train_rmse:.3f}")

# Predictions on the test set
y_test_pred = best_model.predict(X_test)
test_mse = MSE(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
print(f"Test set MSE: {test_mse:.3f}, RMSE: {test_rmse:.3f}")


# cv_results_
cv_results = pd.DataFrame(rand_search.cv_results_)
print(cv_results.columns)

In [None]:
# CV results
cv_results = pd.DataFrame(rand_search.cv_results_)
selected_columns = ['params', 'mean_train_score', 'mean_test_score', 'std_test_score', 'rank_test_score']
df_selected = cv_results[selected_columns]

# Display the CV results DataFrame
print(df_selected)

In [None]:
# Use the adjusted function to plot the 10 best and 10 worst configurations
plot_configurations(result_df=df_selected, is_best=True, model_name='Random Forest')
plot_configurations(result_df=df_selected, is_best=False, model_name='Random Forest')

In [None]:
# Get feature importance
feature_importance = best_model.feature_importances_

# dataframe of feature importance
feature_names = X_train.columns 
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance
})

# Sort the DataFrame by importance
importance_df = importance_df.sort_values(by='Importance', ascending=False)
importance_df = importance_df[importance_df['Importance']>0]
# Plot the feature importance
plt.figure(figsize=(14, 6))
plt.barh(importance_df['Feature'], importance_df['Importance'])
plt.gca().invert_yaxis() 
plt.xlabel('Importance')
plt.title('Feature Importance - Random Forest', fontweight='bold')
plt.savefig('Feature Importance- Random Forest.png', facecolor='white', edgecolor='white')
plt.show()

### Default - Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# default model 
default_tree = GradientBoostingRegressor(random_state=42)


print(default_tree.get_params())
default_tree.fit(X_train, y_train)

y_train_pred = default_tree.predict(X_train)
train_MSE = MSE(y_train,y_train_pred)
train_RMSE = train_MSE**(1/2)
print(f'MSE on train set: {train_MSE}')
print(f'RMSE on train set: {train_RMSE}')

# error of the default gradient boosting model on the held-out test set
y_test_pred = default_tree.predict(X_test)
test_MSE = MSE(y_test,y_test_pred)
test_RMSE = test_MSE**(1/2)
print(f'MSE on test set: {test_MSE}')
print(f'RMSE on test set: {test_RMSE}')


# finding parameters values in default
num_trees = len(default_tree.estimators_)
max_depths = []
samples_per_leaf = []

for tree in default_tree.estimators_[:, 0]: 
    max_depths.append(tree.tree_.max_depth)
    
    n_node_samples = tree.tree_.n_node_samples
    is_leaf = tree.tree_.children_left == -1  
    leaf_samples = n_node_samples[is_leaf]
    samples_per_leaf.extend(leaf_samples)
    
print(f"Average maximum depth: {sum(max_depths) / num_trees}")
print(f"Mean samples per leaf: {np.mean(samples_per_leaf)}")

In [None]:
# split only for plotting 
x_train_split, x_val, y_train_split, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

res = []
n_estimators = range(1, 200)



# Fit one gradient boosting model per n_estimators value and record train / validation RMSE
for estimator_num in n_estimators:
    # the swept hyperparameter is n_estimators (the message used to say min_samples_leaf)
    print(f"Evaluating n_estimators={estimator_num}")
    model = GradientBoostingRegressor(random_state=42, n_estimators=estimator_num)
    model.fit(x_train_split, y_train_split)
    
    # Predict on training data
    y_train_pred = model.predict(x_train_split)
    train_MSE = MSE(y_train_split,y_train_pred)
    train_RMSE = train_MSE**(1/2)
    
    # Predict on validation data
    y_val_pred = model.predict(x_val)
    val_MSE = MSE(y_val,y_val_pred)
    val_RMSE = val_MSE**(1/2)

    res.append({
        'number_of_estimators': estimator_num, 
        'train_rmse': train_RMSE, 
        'val_rmse': val_RMSE
    })

# Convert the list of results to a DataFrame
res = pd.DataFrame(res)

# Plotting the results
plt.figure(figsize=(13, 4))
plt.plot(res['number_of_estimators'], res['train_rmse'], marker='o', markersize=4)
plt.plot(res['number_of_estimators'], res['val_rmse'], marker='o', markersize=4)
plt.legend(['Train RMSE', 'Validation RMSE'])
plt.xlabel('Number of Estimators')
plt.ylabel('RMSE')
plt.title('RMSE by estimators number - gradient Boosting', fontweight='bold')
plt.savefig('number_of_estimators - XGBoost.png', facecolor='white', edgecolor='white')
plt.show()

In [None]:
# Train / validation RMSE of gradient boosting as a function of max_depth.
# No new split is made here: x_train_split / x_val come from the previous cell.

res = []
max_depth = range(1, 15)

for depth in max_depth:
    # the swept hyperparameter is max_depth (the message used to say min_samples_leaf)
    print(f"Evaluating max_depth={depth}")
    model = GradientBoostingRegressor(random_state=42, max_depth=depth)
    model.fit(x_train_split, y_train_split)
    
    # Predict on training data
    y_train_pred = model.predict(x_train_split)
    train_MSE = MSE(y_train_split,y_train_pred)
    train_RMSE = train_MSE**(1/2)
    
    # Predict on validation data
    y_val_pred = model.predict(x_val)
    val_MSE = MSE(y_val,y_val_pred)
    val_RMSE = val_MSE**(1/2)

    res.append({
        'depth': depth, 
        'train_rmse': train_RMSE, 
        'val_rmse': val_RMSE
    })

# Convert the list of results to a DataFrame
res = pd.DataFrame(res)

# Plotting the results
plt.figure(figsize=(13, 4))
plt.plot(res['depth'], res['train_rmse'], marker='o', markersize=4)
plt.plot(res['depth'], res['val_rmse'], marker='o', markersize=4)
plt.legend(['Train RMSE', 'Validation RMSE'])
plt.xlabel('depth')
plt.ylabel('RMSE')
plt.title('RMSE by depth - Gradient Boosting', fontweight='bold')
plt.savefig('Max Depth - XGBoost.png', facecolor='white', edgecolor='white')
plt.show()

## Hyperparameter tuning - Gradient Boosting

In [None]:
# Gradient boosting: 5-fold shuffled CV and the hyperparameter values sampled by the randomized search
kf = KFold(n_splits=5, shuffle=True, random_state=42)
tree_model = GradientBoostingRegressor(random_state=42)
# 40 x 3 x 3 = 360 possible combinations in total
param_grid = {
    # 10 to 49 boosting stages (np.arange excludes the upper bound)
    'n_estimators': list(np.arange(10,50)),
    'max_depth': [2,3,4],
    'learning_rate': [0.01,0.05,0.1]
    
}

rand_search = RandomizedSearchCV(estimator=tree_model, param_distributions=param_grid, scoring='neg_mean_squared_error' , cv=kf, n_jobs=-1, refit=True, return_train_score=True, n_iter=300, random_state=42)
rand_search.fit(X_train, y_train)

# Best model and its hyperparameters
best_model = rand_search.best_estimator_
best_hyperparams = rand_search.best_params_
print("Best hyperparameters:", best_hyperparams)

# Predictions on the training set
y_train_pred = best_model.predict(X_train)
train_mse = MSE(y_train, y_train_pred)
train_rmse = np.sqrt(train_mse)
print(f"Train set MSE: {train_mse:.3f}, RMSE: {train_rmse:.3f}")

# Predictions on the test set
y_test_pred = best_model.predict(X_test)
test_mse = MSE(y_test, y_test_pred)
test_rmse = np.sqrt(test_mse)
print(f"Test set MSE: {test_mse:.3f}, RMSE: {test_rmse:.3f}")


# cv_results_
cv_results = pd.DataFrame(rand_search.cv_results_)
print(cv_results.columns)

In [None]:
# CV results
cv_results = pd.DataFrame(rand_search.cv_results_)
selected_columns = ['params', 'mean_train_score', 'mean_test_score', 'std_test_score', 'rank_test_score']
df_selected = cv_results[selected_columns]

# Display the CV results DataFrame
print(df_selected)

In [None]:
# Use the adjusted function to plot the 10 best and 10 worst configurations
plot_configurations(result_df=df_selected, is_best=True, model_name='Gradient Boosting')
plot_configurations(result_df=df_selected, is_best=False, model_name='Gradient Boosting')

In [None]:
# Get feature importance
feature_importance = best_model.feature_importances_

# dataframe of feature importance
feature_names = X_train.columns 
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance
})

# Sort the DataFrame by importance
importance_df = importance_df.sort_values(by='Importance', ascending=False)
importance_df = importance_df[importance_df['Importance']>0]
# Plot the feature importance
plt.figure(figsize=(20, 10))
plt.barh(importance_df['Feature'], importance_df['Importance'])
plt.gca().invert_yaxis() 
plt.xlabel('Importance')
plt.title('Feature Importance - Gradient Boosting', fontweight='bold')
plt.savefig('Feature Importance- Gradient Boosting', facecolor='white', edgecolor='white')
plt.show()