In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing necessary libraries

In [None]:
#pip install --upgrade scikit-learn

In [None]:
import matplotlib.pyplot as plt
import re
import string
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

# Loading the data

In [None]:
movies_data = pd.read_csv('/kaggle/input/sentiment-prediction-on-movie-reviews/movies.csv')
sample_data = pd.read_csv('/kaggle/input/sentiment-prediction-on-movie-reviews/sample.csv')
test = pd.read_csv('/kaggle/input/sentiment-prediction-on-movie-reviews/test.csv')
train = pd.read_csv('/kaggle/input/sentiment-prediction-on-movie-reviews/train.csv')

### Creating copies of the data for further manipulation

In [None]:
# Take real copies: plain assignment would only create a second name for the
# same DataFrame, so the in-place cleaning done on the *_copy frames further
# down would silently modify the raw frames loaded from disk as well.
movie_copy = movies_data.copy()
test_copy = test.copy()
train_copy = train.copy()

## Movies_data

### Let us look at the movies dataset first

In [None]:
movie_copy.describe()

In [None]:
movie_copy.info()

In [None]:
movie_copy.isna().sum()

- We observe over here that the columns : **'rating','ratingContents','releaseDateTheaters','boxOffice','distributor','soundType'** have over **1,00,000 null values**. Imputing these values to train our model will lead to biased predictions as majority of the data will be synthetic data. Hence we will **drop** these columns

In [None]:
movies_duplicate = movie_copy.duplicated()

# Get the rows that are duplicates
duplicate_rows = movie_copy[movies_duplicate]

# Display the duplicate rows
duplicate_rows


## Summary of the movies dataset

> The columns : **'rating','ratingContents','releaseDateTheaters','boxOffice','distributor','soundType'** have over **1,00,000** null values and imputing these columns might be a bad idea. 

> **Removing** them would be a better idea. 


> There are **1571 duplicate** entries in the dataset. 

# Exploratory Data Analysis & Data pre-processing

## Train dataset

### Let us take a look at the train dataset

In [None]:
train_copy

In [None]:
train_copy.info()

In [None]:
train_copy.describe()

- From the above, we see that we have a total of **162758** entries in train data.
- There are **16812** unique **movieid**.
- There are **4482** unique **reviewerName**.


In [None]:
train['sentiment'].value_counts()

In [None]:
missing_train = train_copy.isna().sum()
missing_train

- We see that there are **6447** null entries in the **reviewText** column of our train dataset. This is a concern, and we need to find a way to handle them.

In [None]:
missing_train_percent = missing_train/len(train)*100

print("Hence we see that the percentage of missing train['reviewText'] data  is:",missing_train_percent['reviewText'])

In [None]:
reviewer_counts=train_copy[train_copy['isFrequentReviewer'] == True]['reviewerName'].value_counts()
reviewer_counts

- The above result tells us that we have **John Luna** who has reviewed the most, followed by **Bryan Phillips**

In [None]:
# Selecting the top 10 reviewers
top_10_reviewers = reviewer_counts.head(10)

# Creating the bar plot
plt.figure(figsize=(10, 6))
plt.bar(top_10_reviewers.index, top_10_reviewers.values, color='skyblue')
plt.xticks(rotation=90)  # Rotating x-axis labels for better readability
plt.xlabel('Reviewer Name')
plt.ylabel('Number of Reviews')
plt.title('Top 10 Frequent Reviewers by Number of Reviews')
plt.tight_layout()
plt.show()

- We can see above the list of the 10 most frequent reviewers.

In [None]:
# Now let us look at the imbalance in the sentiment if there's any present

sentiment_count = train_copy['sentiment'].value_counts()
sentiment_count

In [None]:
sentiment_percent = sentiment_count/len(train) * 100
sentiment_percent

In [None]:
positive_percentage = 66.823751
negative_percentage = 33.176249

# Data for the pie chart
labels = ['Positive', 'Negative']
sizes = [positive_percentage, negative_percentage]
colors = ['skyblue', 'lightcoral']

# Create the pie chart without explode and shadow
plt.figure(figsize=(6, 6))
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140)
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title('Sentiment Distribution')
plt.show()


- There are almost twice as many POSITIVE reviews as NEGATIVE reviews. This suggests class imbalance. 
- Applying class-imbalance techniques like **SMOTE, RandomOverSampler, RandomUnderSampler or ADASYN** might be a good idea.

# Summary of Train dataset

> We have a total of **162758** entries in train dataset. Of these, there are **16812** unique **movieid** and there are **4482** unique reviewerName.

> About **3.9610 %** of the reviewText column has null values.

> **John Luna** has reviewed the most, followed by **Bryan Phillips**

> The train dataset is **imbalanced** with **66.823751% Positive** sentiment reviews and **33.176249% Negative** sentiment reviews

## Test data

### Let us take a look at the test dataset

In [None]:
# A look at the test data

test_copy

- Shape of the test data is **(55315,4)**

In [None]:
missing_test = test_copy.isna().sum()
missing_test

In [None]:
# Divide by the number of rows in the test frame (not the train frame) so the
# result is the share of test reviews that are missing.
missing_test_percent = missing_test/len(test_copy)*100

print("Hence we see that the percentage of missing test['reviewText'] data  is:",missing_test_percent['reviewText'])

# Summary of Test dataset

> The dataset has a shape of **(55315,4)**.

> About **2510** entries or **4.538%** of the reviewText column (2510 of the 55315 test rows) has null values.

## Data Pre-processing

In [None]:
#Initially we will encode the sentiment column in train dataset

label_encoder = LabelEncoder()
train_copy['sentiment'] = label_encoder.fit_transform(train_copy['sentiment'])

In [None]:
# Use a dedicated encoder for this feature column: calling fit_transform on the
# shared `label_encoder` again would overwrite the classes it learned from the
# sentiment column in the previous cell.
train_copy['isFrequentReviewer'] = LabelEncoder().fit_transform(train_copy['isFrequentReviewer'])

In [None]:
#Filling in the missing values

# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: that form is chained assignment, which recent pandas
# versions warn about and which does not update the frame under copy-on-write.
train_copy['reviewText'] = train_copy['reviewText'].fillna('missing')
train_copy.isna().sum()

- We see there are no missing values in our train dataset now. Similarly we will do this for our test dataset

In [None]:
# Applying similar techniques for the test dataset

# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: that form is chained assignment, which recent pandas
# versions warn about and which does not update the frame under copy-on-write.
test_copy['reviewText'] = test_copy['reviewText'].fillna('missing')
test_copy.isna().sum()

## Pre-processing Functions

#### Let us define some functions to clean up our reviewText column

___________________________________________________________________________________________________________________________________________

In [None]:
import re

# Function to cleanup the review column to prepare it for modeling
def clean_review(review):
    """Normalise one raw review into lower-cased, lightly filtered text.

    The input is coerced with ``str`` and lower-cased, then a fixed sequence
    of regex substitutions is applied in order:

    1. strip ``http://`` / ``https://`` URLs,
    2. strip HTML-style ``<...>`` tags,
    3. turn newlines into spaces,
    4. drop single-character word tokens (letters, digits or underscore),
    5. drop every character that is not a word character, whitespace, or one
       of ``!@$%^&*(),.?":{}|<>`` (this already removes apostrophes),
    6. drop stand-alone numbers,
    7. drop apostrophes.

    Whitespace left behind by removed tokens is not collapsed.

    Parameters
    ----------
    review : any
        The raw review; non-string values are converted with ``str``.

    Returns
    -------
    str
        The cleaned review text.
    """
    # Ordered (pattern, replacement) pairs; each step sees the previous step's output.
    substitutions = (
        (r'https?://\S+', ''),
        (r'<[^>]*>', ''),
        ('\n', ' '),
        (r'\b\w\b', ''),
        (r'[^\w\s!@$%^&*(),.?":{}|<>]', ''),
        (r'\b\d+\b', ''),
        (r"'", ''),
    )

    text = str(review).lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text


# For train DataFrame
train_copy['reviewText'] = train_copy['reviewText'].apply(lambda x: clean_review(x))

In [None]:
# Define a function to remove stop words from the reviewText column

stop_words = ["0o","0s","3a","3b","3d","6b","6o","a","a1","a2","a3","a4","ab","able","about","above","abst","ac","accordance","according","accordingly","across","act","actually","ad","added","adj","ae","af","affected","affecting","affects","after","afterwards","ag","again","against","ah","ain","ain't","aj","al","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","amoungst","amount","an","and","announce","another","any","anybody","anyhow","anymore","anyone","anything","anyway","anyways","anywhere","ao","ap","apart","apparently","appear","appreciate","appropriate","approximately","ar","are","aren","arent","aren't","arise","around","as","a's","aside","ask","asking","associated","at","au","auth","av","available","aw","away","awfully","ax","ay","az","b","b1","b2","b3","ba","back","bc","bd","be","became","because","become","becomes","becoming","been","before","beforehand","begin","beginning","beginnings","begins","behind","being","believe","below","beside","besides","best","better","between","beyond","bi","bill","biol","bj","bk","bl","bn","both","bottom","bp","br","brief","briefly","bs","bt","bu","but","bx","by","c","c1","c2","c3","ca","call","came","can","cannot","cant","can't","cause","causes","cc","cd","ce","certain","certainly","cf","cg","ch","changes","ci","cit","cj","cl","clearly","cm","c'mon","cn","co","com","come","comes","con","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn","couldnt","couldn't","course","cp","cq","cr","cry","cs","c's","ct","cu","currently","cv","cx","cy","cz","d","d2","da","date","dc","dd","de","definitely","describe","described","despite","detail","df","di","did","didn","didn't","different","dj","dk","dl","do","does","doesn","doesn't","doing","don","done","don't","down","downwards","dp","dr","ds","dt","du","due","during","dx","dy","e","e2","e3","ea","each","ec","ed","edu","ee","ef","effect","eg","ei","eight","eighty","e
ither","ej","el","eleven","else","elsewhere","em","empty","en","end","ending","enough","entirely","eo","ep","eq","er","es","especially","est","et","et-al","etc","eu","ev","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","ey","f","f2","fa","far","fc","few","ff","fi","fifteen","fifth","fify","fill","find","fire","first","five","fix","fj","fl","fn","fo","followed","following","follows","for","former","formerly","forth","forty","found","four","fr","from","front","fs","ft","fu","full","further","furthermore","fy","g","ga","gave","ge","get","gets","getting","gi","give","given","gives","giving","gj","gl","go","goes","going","gone","got","gotten","gr","greetings","gs","gy","h","h2","h3","had","hadn","hadn't","happens","hardly","has","hasn","hasnt","hasn't","have","haven","haven't","having","he","hed","he'd","he'll","hello","help","hence","her","here","hereafter","hereby","herein","heres","here's","hereupon","hers","herself","hes","he's","hh","hi","hid","him","himself","his","hither","hj","ho","home","hopefully","how","howbeit","however","how's","hr","hs","http","hu","hundred","hy","i","i2","i3","i4","i6","i7","i8","ia","ib","ibid","ic","id","i'd","ie","if","ig","ignored","ih","ii","ij","il","i'll","im","i'm","immediate","immediately","importance","important","in","inasmuch","inc","indeed","index","indicate","indicated","indicates","information","inner","insofar","instead","interest","into","invention","inward","io","ip","iq","ir","is","isn","isn't","it","itd","it'd","it'll","its","it's","itself","iv","i've","ix","iy","iz","j","jj","jr","js","jt","ju","just","k","ke","keep","keeps","kept","kg","kj","km","know","known","knows","ko","l","l2","la","largely","last","lately","later","latter","latterly","lb","lc","le","least","les","less","lest","let","lets","let's","lf","like","liked","likely","line","little","lj","ll","ll","ln","lo","look","looking","looks","los","lr","ls","lt","ltd","m","m2","ma","made","mainly","make","ma
kes","many","may","maybe","me","mean","means","meantime","meanwhile","merely","mg","might","mightn","mightn't","mill","million","mine","miss","ml","mn","mo","more","moreover","most","mostly","move","mr","mrs","ms","mt","mu","much","mug","must","mustn","mustn't","my","myself","n","n2","na","name","namely","nay","nc","nd","ne","near","nearly","necessarily","necessary","need","needn","needn't","needs","neither","never","nevertheless","new","next","ng","ni","nine","ninety","nj","nl","nn","no","nobody","non","none","nonetheless","noone","nor","normally","nos","not","noted","nothing","novel","now","nowhere","nr","ns","nt","ny","o","oa","ob","obtain","obtained","obviously","oc","od","of","off","often","og","oh","oi","oj","ok","okay","ol","old","om","omitted","on","once","one","ones","only","onto","oo","op","oq","or","ord","os","ot","other","others","otherwise","ou","ought","our","ours","ourselves","out","outside","over","overall","ow","owing","own","ox","oz","p","p1","p2","p3","page","pagecount","pages","par","part","particular","particularly","pas","past","pc","pd","pe","per","perhaps","pf","ph","pi","pj","pk","pl","placed","please","plus","pm","pn","po","poorly","possible","possibly","potentially","pp","pq","pr","predominantly","present","presumably","previously","primarily","probably","promptly","proud","provides","ps","pt","pu","put","py","q","qj","qu","que","quickly","quite","qv","r","r2","ra","ran","rather","rc","rd","re","readily","really","reasonably","recent","recently","ref","refs","regarding","regardless","regards","related","relatively","research","research-articl","respectively","resulted","resulting","results","rf","rh","ri","right","rj","rl","rm","rn","ro","rq","rr","rs","rt","ru","run","rv","ry","s","s2","sa","said","same","saw","say","saying","says","sc","sd","se","sec","second","secondly","section","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","sf","shall","shan","shan't",
"she","shed","she'd","she'll","shes","she's","should","shouldn","shouldn't","should've","show","showed","shown","showns","shows","si","side","significant","significantly","similar","similarly","since","sincere","six","sixty","sj","sl","slightly","sm","sn","so","some","somebody","somehow","someone","somethan","something","sometime","sometimes","somewhat","somewhere","soon","sorry","sp","specifically","specified","specify","specifying","sq","sr","ss","st","still","stop","strongly","sub","substantially","successfully","such","sufficiently","suggest","sup","sure","sy","system","sz","t","t1","t2","t3","take","taken","taking","tb","tc","td","te","tell","ten","tends","tf","th","than","thank","thanks","thanx","that","that'll","thats","that's","that've","the","their","theirs","them","themselves","then","thence","there","thereafter","thereby","thered","therefore","therein","there'll","thereof","therere","theres","there's","thereto","thereupon","there've","these","they","theyd","they'd","they'll","theyre","they're","they've","thickv","thin","think","third","this","thorough","thoroughly","those","thou","though","thoughh","thousand","three","throug","through","throughout","thru","thus","ti","til","tip","tj","tl","tm","tn","to","together","too","took","top","toward","towards","tp","tq","tr","tried","tries","truly","try","trying","ts","t's","tt","tv","twelve","twenty","twice","two","tx","u","u201d","ue","ui","uj","uk","um","un","under","unfortunately","unless","unlike","unlikely","until","unto","uo","up","upon","ups","ur","us","use","used","useful","usefully","usefulness","uses","using","usually","ut","v","va","value","various","vd","ve","ve","very","via","viz","vj","vo","vol","vols","volumtype","vq","vs","vt","vu","w","wa","want","wants","was","wasn","wasnt","wasn't","way","we","wed","we'd","welcome","well","we'll","well-b","went","were","we're","weren","werent","weren't","we've","what","whatever","what'll","whats","what's","when","whence","whenever","when's","where","whereafter"
,"whereas","whereby","wherein","wheres","where's","whereupon","wherever","whether","which","while","whim","whither","who","whod","whoever","whole","who'll","whom","whomever","whos","who's","whose","why","why's","wi","widely","will","willing","wish","with","within","without","wo","won","wonder","wont","won't","words","world","would","wouldn","wouldnt","wouldn't","www","x","x1","x2","x3","xf","xi","xj","xk","xl","xn","xo","xs","xt","xv","xx","y","y2","yes","yet","yj","yl","you","youd","you'd","you'll","your","youre","you're","yours","yourself","yourselves","you've","yr","ys","yt","z","zero","zi","zz"]


# Define the function to remove stop words from a single review
def remove_stop_words_from_review(review, stop_words):
    """Return ``review`` with every whitespace-separated stop word removed.

    Parameters
    ----------
    review : str
        Text to filter. It is split on whitespace, so the result has its
        remaining words joined by single spaces.
    stop_words : collection of str
        Words to drop. Each word of the review is lower-cased before the
        membership test, so entries are expected to be lower-case.

    Returns
    -------
    str
        The review without the stop words.
    """
    # Looking a word up in a list scans the whole list; with a long stop-word
    # list that cost is paid for every word of every review. Hash the entries
    # once per call so each lookup is constant time. Only lists/tuples are
    # converted, so sets, dicts or other containers behave as before.
    if isinstance(stop_words, (list, tuple)):
        try:
            stop_words = frozenset(stop_words)
        except TypeError:
            # Some entry is unhashable: fall back to the original linear lookup.
            pass

    kept_words = [word for word in review.split() if word.lower() not in stop_words]
    return ' '.join(kept_words)

In [None]:
train_copy.dtypes

In [None]:
train_copy['reviewText'] = train_copy['reviewText'].apply(lambda x: remove_stop_words_from_review(str(x), stop_words))

In [None]:
train_copy

____________________________________

In [None]:
#Let us look at the duplicates present in the train data

train_duplicate = train_copy[train_copy.duplicated(['movieid','reviewerName'],keep=False)]

train_duplicate.shape

In [None]:
#Grouping the duplicates together

train_duplicate.groupby(['movieid']).count()

In [None]:
# Checking for the shapes of the sentiments of the duplicates

train_duplicate[train_duplicate.sentiment == 1].shape, train_duplicate[train_duplicate.sentiment == 0].shape

In [None]:
## Let us clean up the test data now

test_copy['reviewText'] = test_copy['reviewText'].apply(lambda x: clean_review(x))
test_copy['reviewText'] = test_copy['reviewText'].apply(lambda x: remove_stop_words_from_review(str(x), stop_words))

In [None]:
test_copy

In [None]:
test_copy['isTopCritic'].value_counts()

In [None]:
#Encoding the isTopCritic column in test data

# Use a dedicated encoder here so the shared `label_encoder` is not re-fitted
# on a test-set feature column (fit_transform replaces its learned classes).
test_copy['isTopCritic'] = LabelEncoder().fit_transform(test_copy['isTopCritic'])

## Movies dataset- a recap

In [None]:
movie_copy.info()

In [None]:
# Function to determine the total percentage of null values in a column

def null_value_percent(df):
    """Report the percentage of missing values in each column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` and a single ``'Percent'`` column holding
        the share of null entries, rounded to two decimals.
    """
    missing_per_column = df.isna().sum()
    share = (missing_per_column / len(df) * 100).round(2)
    return pd.DataFrame(share, columns=['Percent'])


In [None]:
null_value_percent(movie_copy)

- We will drop the columns having null values above 50%

In [None]:
movie_copy.describe(include='all')

In [None]:
movie_copy.movieid.value_counts()

From the earlier discussion, we see that some of the columns have over 50% missing data. Therefore we will drop these columns.

In [None]:
drop_col = ['rating','ratingContents','releaseDateTheaters','boxOffice','distributor','soundType']
drop_movie = movie_copy.drop(columns = drop_col)
drop_movie.head()

In [None]:
drop_movie.shape

In [None]:
# Next let us drop the duplicates from the movies data
duplicate_movie = drop_movie.drop_duplicates('movieid')
duplicate_movie

## Merging train and movies dataset

In [None]:
merged_train_data = pd.merge(train_copy, duplicate_movie, on='movieid', how='inner')
merged_train_data.head()

In [None]:
merged_train_data.shape

In [None]:
null_value_percent(merged_train_data)

> **Some columns still have null values and we need to fix them**

In [None]:
merged_train_data['audienceScore'].isna().sum()

### Functions for imputation and plotting a histogram

____________________________________________________________________________________________________________________________________________________

In [None]:
from sklearn.impute import SimpleImputer

def impute_column_with_median(data, column):
    """Fill the missing entries of ``data[column]`` in place with its median.

    A fresh ``SimpleImputer(strategy='median')`` is fitted on the column that
    is passed in, so the median comes from ``data`` itself. The frame is
    modified and nothing is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column is overwritten.
    column : str
        Name of the column to impute.
    """
    # SimpleImputer expects 2-D input, so present the column as shape (n, 1).
    as_matrix = data[column].values.reshape(-1, 1)
    filled = SimpleImputer(strategy='median').fit_transform(as_matrix)

    # Collapse back to 1-D before writing the result over the original column.
    data[column] = filled.flatten()


In [None]:
def impute_column_with_most_frequent(data, column):
    """Fill the missing entries of ``data[column]`` in place with its mode.

    A fresh ``SimpleImputer(strategy='most_frequent')`` is fitted on the
    column that is passed in, so the replacement value comes from ``data``
    itself. The frame is modified and nothing is returned.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column is overwritten.
    column : str
        Name of the column to impute.
    """
    # SimpleImputer expects 2-D input, so present the column as shape (n, 1).
    as_matrix = data[column].values.reshape(-1, 1)
    filled = SimpleImputer(strategy='most_frequent').fit_transform(as_matrix)

    # Collapse back to 1-D before writing the result over the original column.
    data[column] = filled.flatten()


In [None]:
def plot_histogram(data, column, bins=10, edgecolor='black'):
    """Draw and show a histogram of ``data[column]``.

    The histogram is drawn on the current matplotlib axes, labelled with the
    column name on the x-axis and ``'Frequency'`` on the y-axis, and then
    displayed with ``plt.show()``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to plot.
    column : str
        Name of the column; also used for the x-label and the title.
    bins : int, default 10
        Passed through to the histogram call.
    edgecolor : str, default 'black'
        Colour of the bar outlines.
    """
    axes = plt.gca()
    axes.hist(data[column], bins=bins, edgecolor=edgecolor)

    axes.set_xlabel(column)
    axes.set_ylabel('Frequency')
    axes.set_title(f'Distribution of {column}')

    plt.show()


__________________________________________________________________________________________________________________________________________________

In [None]:
#Plotting the histogram before imputation for audienceScore

plot_histogram(merged_train_data, 'audienceScore', bins=10, edgecolor='blue')

In [None]:
impute_column_with_median(merged_train_data, 'audienceScore')

In [None]:
#Plotting the histogram after imputation for audienceScore

plot_histogram(merged_train_data, 'audienceScore', bins=15, edgecolor='blue')

> **We see that the distribution is maintained when using the median strategy except the fact that there is a spike at the median value**

In [None]:
null_value_percent(merged_train_data)

In [None]:
#Plotting the histogram before imputation for runtimeMinutes
plot_histogram(merged_train_data, 'runtimeMinutes', bins=10, edgecolor='green')

In [None]:
impute_column_with_median(merged_train_data, 'runtimeMinutes')

In [None]:
#Plotting the histogram after imputation for runtimeMinutes
plot_histogram(merged_train_data, 'runtimeMinutes', bins=10, edgecolor='green')

In [None]:
merged_train_data.originalLanguage.value_counts()

In [None]:
# Before imputation

# Get the value counts of each language
language_counts = merged_train_data['originalLanguage'].value_counts()

# Extract the top 5 languages
top_5_languages = language_counts.head(5)

# Plot the countplot for the top 5 languages
plt.figure(figsize=(10, 6))  
sns.countplot(y='originalLanguage', data=merged_train_data, order=top_5_languages.index)

# Set the labels and title
plt.xlabel('Count')
plt.ylabel('Original Language')
plt.title('Distribution of Top 5 Original Languages before imputation')

# Display the plot
plt.show()

In [None]:
impute_column_with_most_frequent(merged_train_data, 'originalLanguage')

In [None]:
# After imputation

# Get the value counts of each language
language_counts = merged_train_data['originalLanguage'].value_counts()

# Extract the top 5 languages
top_5_languages = language_counts.head(5)

# Plot the countplot for the top 5 languages
plt.figure(figsize=(10, 6))  
sns.countplot(y='originalLanguage', data=merged_train_data, order=top_5_languages.index)

# Set the labels and title
plt.xlabel('Count')
plt.ylabel('Original Language')
plt.title('Distribution of Top 5 Original Languages after imputation')

# Display the plot
plt.show()

- The graph above shows the **top 5** most frequently occurring languages in the originalLanguage column; most of the reviewed movies are originally in **English**. The graph is plotted after imputing the missing values with the most_frequent strategy.

In [None]:
merged_train_data.genre.value_counts()

In [None]:
# Before imputation

# Get the value counts of each genre
# NOTE: the variable names below say "language" but they hold genre data.
language_counts = merged_train_data['genre'].value_counts()

# Extract the top 5 genres
top_5_languages = language_counts.head(5)

# Plot the countplot for the top 5 genres
plt.figure(figsize=(10, 6))  
sns.countplot(y='genre', data=merged_train_data, order=top_5_languages.index)

# Set the labels and title
plt.xlabel('Count')
plt.ylabel('Genre')
plt.title('Distribution of Top 5 Genres before imputation')

# Display the plot
plt.show()

In [None]:

impute_column_with_most_frequent(merged_train_data, 'genre')

In [None]:
# After imputation

# Get the value counts of each genre
# NOTE: the variable names below say "language" but they hold genre data.
language_counts = merged_train_data['genre'].value_counts()

# Extract the top 5 genres
top_5_languages = language_counts.head(5)

# Plot the countplot for the top 5 genres
plt.figure(figsize=(10, 6))  
sns.countplot(y='genre', data=merged_train_data, order=top_5_languages.index)

# Set the labels and title
plt.xlabel('Count')
plt.ylabel('Genre')
plt.title('Distribution of Top 5 Genres after imputation')

# Display the plot
plt.show()

In [None]:
merged_train_data.releaseDateStreaming.value_counts()

In [None]:
# Group the data by 'streaming_date' and count the number of movies on each date
movies_streamed_by_date = merged_train_data['movieid'].groupby(merged_train_data['releaseDateStreaming']).count()

# Reset the index to convert the groupby result into a DataFrame
movies_streamed_by_date = movies_streamed_by_date.reset_index()

# Sort the DataFrame by count of movies in descending order
movies_streamed_by_date_sorted = movies_streamed_by_date.sort_values(by='movieid', ascending=False)

# Select the top 10 dates with the highest number of movies streamed
top_10_dates = movies_streamed_by_date_sorted.head(10)

# Display the top 10 dates
print(top_10_dates)

# Plot the line plot for the top 10 dates
plt.figure(figsize=(12, 6))  # Adjust the figure size if needed for better visibility
plt.plot(top_10_dates['releaseDateStreaming'], top_10_dates['movieid'], marker='o', linestyle='-', color='b')

# Set the labels and title
plt.xlabel('Streaming Date')
plt.ylabel('Number of Movies Streamed')
plt.title('Top 10 Dates with the Highest Number of Movies Streamed before imputation')

# Rotate the x-axis labels for better readability
plt.xticks(rotation=45)

# Display the plot
plt.show()


In [None]:
impute_column_with_most_frequent(merged_train_data, 'releaseDateStreaming')

In [None]:
# Group the data by 'streaming_date' and count the number of movies on each date
movies_streamed_by_date = merged_train_data['movieid'].groupby(merged_train_data['releaseDateStreaming']).count()

# Reset the index to convert the groupby result into a DataFrame
movies_streamed_by_date = movies_streamed_by_date.reset_index()

# Sort the DataFrame by count of movies in descending order
movies_streamed_by_date_sorted = movies_streamed_by_date.sort_values(by='movieid', ascending=False)

# Select the top 10 dates with the highest number of movies streamed
top_10_dates = movies_streamed_by_date_sorted.head(10)

# Display the top 10 dates
print(top_10_dates)

# Plot the line plot for the top 10 dates
plt.figure(figsize=(12, 6))  # Adjust the figure size if needed for better visibility
plt.plot(top_10_dates['releaseDateStreaming'], top_10_dates['movieid'], marker='o', linestyle='-', color='b')

# Set the labels and title
plt.xlabel('Streaming Date')
plt.ylabel('Number of Movies Streamed')
plt.title('Top 10 Dates with the Highest Number of Movies Streamed after imputation')

# Rotate the x-axis labels for better readability
plt.xticks(rotation=45)

# Display the plot
plt.show()


In [None]:
null_value_percent(merged_train_data)

In [None]:
merged_sentiment_count = merged_train_data['sentiment'].value_counts()
merged_sentiment_count

merged_sentiment_percent = merged_sentiment_count/len(merged_train_data) * 100
merged_sentiment_percent



In [None]:
# Take the slice sizes from the percentages computed for merged_train_data in
# the previous cell instead of re-typing the numbers of the un-merged train
# set, so the chart reflects the merged data.
# sentiment was label-encoded earlier; 1 is assumed to be the positive class
# and 0 the negative class -- TODO confirm against the raw labels.
positive_percentage = merged_sentiment_percent.get(1, 0.0)
negative_percentage = merged_sentiment_percent.get(0, 0.0)

# Data for the pie chart
labels = ['Positive', 'Negative']
sizes = [positive_percentage, negative_percentage]
colors = ['skyblue', 'lightcoral']

# Create the pie chart without explode and shadow
plt.figure(figsize=(6, 6))
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140)
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title('Sentiment Distribution')
plt.show()


- We notice the **imbalance is maintained** even in the merged training and movies dataset.

_______________________________________________________________________________________________________________________

### Applying SMOTE to balance the merged movies and train dataset

In [None]:
# from sklearn.preprocessing import LabelEncoder
# from imblearn.over_sampling import SMOTE

# # Separate features and labels
# features = merged_train_data.drop('sentiment', axis=1)
# labels = merged_train_data['sentiment']

# # Apply LabelEncoder to categorical features
# label_encoders = {}
# for column in features.select_dtypes(include=['object']).columns:
#     le = LabelEncoder()
#     features[column] = le.fit_transform(features[column])
#     label_encoders[column] = le

# # Apply SMOTE
# smote = SMOTE(sampling_strategy='auto') # 'auto' balances classes
# features_resampled, labels_resampled = smote.fit_resample(features, labels)

# # Merge the resampled features and labels back into a dataframe
# merged_train_data_resampled = pd.concat([features_resampled, labels_resampled], axis=1)



In [None]:
# merged_sentiment_count = merged_train_data_resampled ['sentiment'].value_counts()
# merged_sentiment_count

# merged_sentiment_percent = merged_sentiment_count/len(merged_train_data_resampled ) * 100
# merged_sentiment_percent

In [None]:
# 1    50.0
# 0    50.0
# Name: sentiment, dtype: float64

In [None]:
positive_percentage = 50.0
negative_percentage = 50.0

# Data for the pie chart
labels = ['Positive', 'Negative']
sizes = [positive_percentage, negative_percentage]
colors = ['skyblue', 'lightcoral']

# Create the pie chart without explode and shadow
plt.figure(figsize=(6, 6))
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%', startangle=140)
plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle.
plt.title('Sentiment Distribution')
plt.show()


We see that our dataset is balanced now, with 50% positive and 50% negative classes, after applying **SMOTE**. However, on further analysis, it was found that balancing the dataset **did not bring any significant improvements to our model**; hence we will proceed with the imbalanced dataset and comment out the above code for balancing the dataset.

#### Take-aways:

1. Applying SMOTE balanced our dataset.

2. However, on further analysis there was **no significant improvement** to the models.

3. Hence, we will drop the idea of balancing our dataset and **proceed with the imbalanced dataset.**

______________________________________________________________________________________________________________________

### Intuitive EDA

In [None]:
sns.displot(data=merged_train_data, x='audienceScore', hue='sentiment', kind='kde')
# Set the labels and title
# The audience score is on the x-axis; the y-axis of a KDE plot is the
# estimated density, so label it as such.
plt.xlabel('Audience Score')
plt.ylabel('Density')
plt.title('How Audience Score affects the sentiment ')

In [None]:
sns.displot(data=merged_train_data, x='runtimeMinutes', hue='sentiment', kind='kde')
# Set the labels and title
# The runtime is on the x-axis; the y-axis of a KDE plot is the estimated
# density, so label it as such.
plt.xlabel('Runtime Minutes')
plt.ylabel('Density')
plt.title('How runtime Minutes affects the sentiment ')

## Summary

> The missing values were **Imputed** for the various columns.

> **English** was the most frequently used language.

> Most movies were **Drama** based.

> The **data imbalance** is conserved in the merged movies and train dataset.

> There seems to be a correlation between **audienceScore** and **sentiment** as well as **runtimeMinutes** and **sentiment**

## Merging test and movies dataset

In [None]:
merged_test_data = pd.merge(test_copy, duplicate_movie, on='movieid', how='left')
merged_test_data.head()

In [None]:
merged_test_data.shape

In [None]:
null_value_percent(merged_test_data)

In [None]:
# Using the same imputation techniques used on merged_train_data to handle missing values in merged_test_data

impute_column_with_median(merged_test_data, 'audienceScore')

impute_column_with_most_frequent(merged_test_data, 'releaseDateStreaming')

impute_column_with_most_frequent(merged_test_data, 'genre')

impute_column_with_most_frequent(merged_test_data, 'originalLanguage')

impute_column_with_median(merged_test_data, 'runtimeMinutes')


In [None]:
null_value_percent(merged_test_data)

> We have **removed all the missing values** from our **merged training and test dataset**. We can move forward to splitting the dataset for further treatment

## Scaling our data

In [None]:
# Use MinMax Scaler to scale the runtimeMinutes and audienceScore columns

from sklearn.preprocessing import MinMaxScaler

def min_max_scale(data, column):
    """Rescale one column of ``data`` in place with a freshly fitted MinMaxScaler.

    Parameters
    ----------
    data : object supporting ``data[column]`` lookup and assignment
        (used with pandas DataFrames in this notebook).
    column : label of the column to rescale.

    The scaled values overwrite ``data[column]``; nothing is returned.
    """
    # The scaler is fed 2D input, so present the column as a single-feature matrix.
    values_as_matrix = data[column].values.reshape(-1, 1)

    # Fit on this column alone and write the result straight back.
    data[column] = MinMaxScaler().fit_transform(values_as_matrix)


In [None]:
# Scaling the columns of audienceScore, runtimeMinutes for both merged training and test data.
# Each scaler is fitted on the TRAINING column only and then reused to transform the
# test column, so both datasets share one min/max mapping. Fitting a second scaler on
# the test data would map identical raw values to different scaled values.
for column in ['audienceScore', 'runtimeMinutes']:
    column_scaler = MinMaxScaler()
    merged_train_data[column] = column_scaler.fit_transform(merged_train_data[[column]]).ravel()
    # Test values outside the training range land outside [0, 1]; that is expected.
    merged_test_data[column] = column_scaler.transform(merged_test_data[[column]]).ravel()


# Splitting our dataset

In [None]:
# splitting the merged dataset into train and test set

merge_train, merge_test = train_test_split(merged_train_data, test_size = 0.2, stratify= merged_train_data.sentiment, random_state = 10)
merge_train.shape, merge_test.shape

In [None]:
positive_train = merge_train[merge_train.sentiment==1]
negative_train = merge_train[merge_train.sentiment==0]


positive_test = merge_test[merge_test.sentiment==1]
negative_test = merge_test[merge_test.sentiment==0]


In [None]:
#Check imbalance

(positive_train.shape,negative_train.shape),(positive_test.shape,negative_test.shape)


In [None]:
positive_train.describe()

In [None]:
negative_train.describe()

In [None]:
positive_test.describe()

In [None]:
negative_test.describe()

In [None]:
sns.heatmap(merged_train_data.corr(numeric_only=True),annot=True)

> **audienceScore** and **runtimeMinutes** seem to be correlated with each other and with **sentiment** as well.

In [None]:
merged_train_data= merged_train_data.drop(columns = ['movieid','reviewerName','isFrequentReviewer','title','releaseDateStreaming','genre','originalLanguage','director'],axis=1)
merged_train_data

# Feature Selection

In [None]:
X = merged_train_data[['reviewText', 'audienceScore', 'runtimeMinutes']]
y = merged_train_data['sentiment']


In [None]:
X.shape,y.shape

In [None]:
# Split the data into training and testing sets with stratified sampling
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=10)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

## Evaluation metrics functions

_________________________________________________________________________________________________________________________________________________

In [None]:
#Function to print the classification report as well as the F1-micro score and classification matrix
from sklearn.metrics import classification_report, confusion_matrix, f1_score

def report(X_test, y_test, pipeline):
    """Print a classification report and the F1-micro score, then plot the confusion matrix.

    Parameters
    ----------
    X_test : features passed to ``predict``.
    y_test : true labels the predictions are compared against.
    pipeline : fitted estimator. If it exposes ``best_estimator_`` (as a fitted
        GridSearchCV does), that estimator is used for prediction instead.
    """
    # Fall back to the object itself when there is no best_estimator_ attribute.
    model = getattr(pipeline, 'best_estimator_', pipeline)
    y_pred = model.predict(X_test)

    divider = "-------------------------------------------------------------"

    # Text summary: per-class metrics followed by the micro-averaged F1.
    print("Classification Report:\n", classification_report(y_test, y_pred))
    print(divider)
    print("F1-micro Score:", f1_score(y_test, y_pred, average='micro'))
    print(divider)

    # Confusion matrix rendered as an annotated heatmap of integer counts.
    matrix = confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='viridis', cbar=False)
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title('Confusion Matrix')
    plt.show()



In [None]:
#Function to plot the ROC curve and ROC AUC score


from sklearn.metrics import roc_curve, roc_auc_score

def plot_roc_curve(y_true, y_prob):
    """Draw the ROC curve twice (plain, then with the area shaded) and print the ROC AUC.

    Parameters
    ----------
    y_true : true binary labels.
    y_prob : scores for the positive class, forwarded unchanged to
        ``roc_curve`` and ``roc_auc_score``.
    """
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    roc_auc = roc_auc_score(y_true, y_prob)
    curve_label = 'ROC curve (area = %0.2f)' % roc_auc

    # Both figures share every setting except the size and the shaded area.
    for figure_size, shade_area in (((10, 6), False), ((6, 6), True)):
        plt.figure(figsize=figure_size)
        plt.plot(fpr, tpr, color='darkorange', lw=2, label=curve_label)
        if shade_area:
            plt.fill_between(fpr, tpr, alpha=0.5, color='lightblue')
        # Diagonal reference line.
        plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic (ROC) Curve')
        plt.legend(loc="lower right")

    plt.show()

    print(f"ROC AUC: {roc_auc:.2f}")


In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, auc
from sklearn.feature_extraction.text import TfidfVectorizer

def plot_precision_recall_curve_for_pipeline(pipeline, classifier_step, X_train, y_train):
    """Fit ``pipeline`` on the training data and plot its precision-recall curve on that same data.

    Parameters
    ----------
    pipeline : pipeline exposing ``named_steps``; it is (re)fitted here.
    classifier_step : name of the step checked for ``predict_proba``.
    X_train, y_train : data used both for fitting and for scoring.
    """
    pipeline.fit(X_train, y_train)

    # Use positive-class probabilities when the classifier step offers them,
    # otherwise the pipeline's decision_function values.
    classifier = pipeline.named_steps[classifier_step]
    if not hasattr(classifier, 'predict_proba'):
        y_scores = pipeline.decision_function(X_train)
    else:
        y_scores = pipeline.predict_proba(X_train)[:, 1]

    precision, recall, _ = precision_recall_curve(y_train, y_scores)
    pr_auc = auc(recall, precision)

    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, color='b', label=f'Precision-Recall curve (AUC = {pr_auc:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend(loc='lower left')
    plt.grid(True)
    plt.show()




In [None]:
#Precision-Recall curve for Test set

def plot_precision_recall_curve_for_sets(pipeline, classifier_step, X_train, y_train, X_val, y_val):
    """Fit ``pipeline`` on the training data, then plot one precision-recall curve per data set.

    Parameters
    ----------
    pipeline : pipeline exposing ``named_steps``; it is (re)fitted on the training data.
    classifier_step : name of the classifier step, forwarded to ``plot_precision_recall_curve``.
    X_train, y_train : data used for fitting and for the first plot.
    X_val, y_val : held-out data used for the second plot.
    """
    # Fit the pipeline on the training data
    pipeline.fit(X_train, y_train)

    # Plot for the training data
    plot_precision_recall_curve(pipeline, classifier_step, X_train, y_train, title='Training Set')

    # Plot for the held-out data. This was previously titled 'Train Set', which made
    # the two figures indistinguishable.
    plot_precision_recall_curve(pipeline, classifier_step, X_val, y_val, title='Test Set')

def plot_precision_recall_curve(pipeline, classifier_step, X, y, title):
    """Plot the precision-recall curve of ``pipeline`` on ``(X, y)``; the pipeline is not fitted here.

    Parameters
    ----------
    pipeline : pipeline exposing ``named_steps`` and a scoring method.
    classifier_step : name of the step checked for ``predict_proba``.
    X, y : features and true labels to score against.
    title : suffix appended to the figure title.
    """
    # Use positive-class probabilities when the classifier step offers them,
    # otherwise the pipeline's decision_function values.
    if not hasattr(pipeline.named_steps[classifier_step], 'predict_proba'):
        y_scores = pipeline.decision_function(X)
    else:
        y_scores = pipeline.predict_proba(X)[:, 1]

    precision, recall, _ = precision_recall_curve(y, y_scores)
    pr_auc = auc(recall, precision)

    plt.figure(figsize=(8, 6))
    plt.plot(recall, precision, color='b', label=f'Precision-Recall curve (AUC = {pr_auc:.2f})')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(f'Precision-Recall Curve - {title}')
    plt.legend(loc='lower left')
    plt.grid(True)
    plt.show()

_________________________________________________________________________________________________________________________________________________

# Feature Extraction

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler

# Define the tfidf_transformer
tfidf_transformer = TfidfVectorizer()

# Define the numeric_transformer
numeric_transformer = MinMaxScaler()

tfidf_column = 'reviewText'  # No list here
numeric_columns = ['audienceScore', 'runtimeMinutes']

preprocessor = ColumnTransformer(
    transformers=[
        ('tfidf', tfidf_transformer, tfidf_column),  # No list here
        ('numeric', numeric_transformer, numeric_columns)
    ])

# Modelling

# Logistic Regression

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
# Create the pipeline with preprocessor and Logistic Regression
pipe_log = Pipeline([
    ('preprocessor', preprocessor),
    ('logit', LogisticRegression(max_iter=1000))
])

In [None]:
# Fit the pipeline on the training data
pipe_log.fit(X_train, y_train)

In [None]:
# Make predictions on the test data
y_pred_test = pipe_log.predict(X_test)

In [None]:
y_pred_train = pipe_log.predict(X_train)

report(X_train,y_train,pipe_log)

In [None]:
# Evaluate the model on the test data
report(X_test, y_test, pipe_log)

In [None]:
plot_precision_recall_curve_for_sets(pipe_log, 'logit', X_train, y_train, X_test, y_test)

# NaiveBayes

> ### Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB
# Create a pipeline for Naive Bayes classifier
pipe_nb = Pipeline([
    ('preprocessor', preprocessor),
    ('nb', MultinomialNB())
])

In [None]:
# Fit the pipeline on the training data
pipe_nb.fit(X_train, y_train)

In [None]:
# Predict the labels on the test data
y_pred = pipe_nb.predict(X_test)

In [None]:
report(X_train,y_train,pipe_nb)

In [None]:
report(X_test, y_test, pipe_nb)

In [None]:
# ROC needs continuous scores, so pass the positive-class probabilities rather than
# the hard 0/1 predictions (which reduce the curve to a single operating point).
y_pred_prob = pipe_nb.predict_proba(X_test)[:, 1]
plot_roc_curve(y_test, y_pred_prob)

In [None]:
plot_precision_recall_curve_for_sets(pipe_nb, 'nb', X_train, y_train, X_test, y_test)

> ### Complement NB

In [None]:
from sklearn.naive_bayes import ComplementNB
# Create a pipeline for Naive Bayes classifier
pipe_cnb = Pipeline([
    ('preprocessor', preprocessor),
    ('nb', ComplementNB())
])

In [None]:
# Fit the pipeline on the training data
pipe_cnb.fit(X_train, y_train)

In [None]:
# Predict the labels on the test data
y_pred = pipe_cnb.predict(X_test)

In [None]:
report(X_train,y_train,pipe_cnb)

In [None]:
report(X_test, y_test, pipe_cnb)

In [None]:
plot_precision_recall_curve_for_sets(pipe_cnb, 'nb', X_train, y_train, X_test, y_test)

In [None]:
# ROC needs continuous scores, so pass the positive-class probabilities rather than
# the hard 0/1 predictions (which reduce the curve to a single operating point).
y_pred_prob = pipe_cnb.predict_proba(X_test)[:, 1]
plot_roc_curve(y_test, y_pred_prob)

# SGD Classifier

In [None]:
from sklearn.linear_model import SGDClassifier

pipe_sgd = Pipeline([
    ('preprocessor', preprocessor),
    ('sgd', SGDClassifier(random_state=5))
])

In [None]:
# Fit the pipeline on the training data
pipe_sgd.fit(X_train, y_train)

In [None]:
# Predict the labels on the test data
y_pred = pipe_sgd.predict(X_test)

In [None]:
report(X_train,y_train,pipe_sgd)

In [None]:
report(X_test, y_test, pipe_sgd)

In [None]:
plot_precision_recall_curve_for_sets(pipe_sgd, 'sgd', X_train, y_train, X_test, y_test)

In [None]:
# SGDClassifier with its default hinge loss offers no predict_proba, so the ROC curve
# is drawn from the signed decision_function margins instead of the hard 0/1
# predictions (which reduce the curve to a single operating point).
y_score_sgd = pipe_sgd.decision_function(X_test)
plot_roc_curve(y_test, y_score_sgd)

# Light Gradient Boosting Model Classifier 

In [None]:
from lightgbm import LGBMClassifier

pipe_lgbm = Pipeline([
    ('preprocessor', preprocessor),  # TF-IDF vectorizer
    ('lgbm', LGBMClassifier(random_state=5))  # LightGBM classifier
])

In [None]:
pipe_lgbm.fit(X_train,y_train)

In [None]:
y_pred = pipe_lgbm.predict(X_test)

In [None]:
report(X_train,y_train,pipe_lgbm)

In [None]:
report(X_test, y_test, pipe_lgbm)

In [None]:
plot_precision_recall_curve_for_sets(pipe_lgbm, 'lgbm', X_train, y_train, X_test, y_test)

In [None]:
# Calculate predicted probabilities for ROC curve
y_prob_test = pipe_lgbm.predict_proba(X_test)[:, 1]

# Plot ROC curve and calculate ROC AUC score
plot_roc_curve(y_test, y_prob_test)

# Linear SVC

In [None]:
from sklearn.svm import LinearSVC

pipe_svc = Pipeline([
    ('preprocessor', preprocessor),  # TF-IDF vectorizer
    ('svc', LinearSVC(random_state=5,max_iter=2000))  # LinearSVC classifier
])

In [None]:
pipe_svc.fit(X_train,y_train)

In [None]:
y_pred = pipe_svc.predict(X_test)

In [None]:
report(X_train,y_train,pipe_svc)

In [None]:
report(X_test, y_test, pipe_svc)

In [None]:
# Plot the precision_recall curve
plot_precision_recall_curve_for_sets(pipe_svc, 'svc', X_train, y_train, X_test, y_test)

### Calibrated SVC

In [None]:
from sklearn.calibration import CalibratedClassifierCV

# Create a calibrated classifier from the LinearSVC
calibrated_svc = CalibratedClassifierCV(pipe_svc)

In [None]:
calibrated_svc.fit(X_train,y_train)

In [None]:
# Calculate predicted probabilities for ROC curve
y_prob_test = calibrated_svc.predict_proba(X_test)[:, 1]

# Plot ROC curve and calculate ROC AUC score
plot_roc_curve(y_test, y_prob_test)

In [None]:
report(X_train,y_train,calibrated_svc)

In [None]:
# Print the evaluation report
report(X_test, y_test, calibrated_svc)

# XGBoost

In [None]:
from xgboost import XGBClassifier

# Create a pipeline for XGBoost sentiment analysis
pipe_xgboost = Pipeline([
    ('preprocessor', preprocessor),  
    ('xgboost', XGBClassifier(random_state=42))  # XGBoost classifier
    ])


In [None]:
# Fit the pipeline on the training data
pipe_xgboost.fit(X_train, y_train)

In [None]:
# Predict on the test data
y_pred = pipe_xgboost.predict(X_test)

In [None]:
report(X_train,y_train,pipe_xgboost)

In [None]:
report(X_test, y_test, pipe_xgboost)

In [None]:
# Plot the precision_recall curve
plot_precision_recall_curve_for_sets(pipe_xgboost, 'xgboost', X_train, y_train, X_test, y_test)

# Comparative Analysis of models

## Precision-Recall curve comparison

In [None]:
# List of pipelines
pipelines = [
    (pipe_log, 'logit'),
    (pipe_nb, 'nb'),
    (pipe_sgd, 'sgd'),
    (pipe_lgbm, 'lgbm'),
    (pipe_svc, 'svc'),
    (pipe_xgboost, 'xgboost')
]

In [None]:
#Aggregrate PR curves on Training set

def agg_precision_recall_curve_train(pipelines, X_train, y_train, title):
    """Fit every pipeline on the training data and overlay their precision-recall curves on it.

    Parameters
    ----------
    pipelines : iterable of ``(pipeline, classifier_step)`` pairs; each pipeline is (re)fitted here.
    X_train, y_train : data used both for fitting and for scoring.
    title : title of the shared figure.
    """
    plt.figure(figsize=(10, 6))

    for model, step_name in pipelines:
        model.fit(X_train, y_train)

        # Use positive-class probabilities when the classifier step offers them,
        # otherwise the pipeline's decision_function values.
        classifier = model.named_steps[step_name]
        if not hasattr(classifier, 'predict_proba'):
            scores = model.decision_function(X_train)
        else:
            scores = model.predict_proba(X_train)[:, 1]

        precision, recall, _ = precision_recall_curve(y_train, scores)
        pr_auc = auc(recall, precision)

        # One labelled curve per model, all on the same axes.
        plt.plot(recall, precision, label=f'{step_name} (AUC = {pr_auc:.2f})')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.show()
    
agg_precision_recall_curve_train(pipelines, X_train, y_train, 'Precision-Recall Curves (Training Data)')

In [None]:
#Aggregrate PR curves on Test set

def agg_precision_recall_curve_test(pipelines, X_test, y_test, title):
    """Overlay the precision-recall curves of several pipelines on ``(X_test, y_test)``.

    The pipelines are not fitted here; they are only asked for scores.

    Parameters
    ----------
    pipelines : iterable of ``(pipeline, classifier_step)`` pairs.
    X_test, y_test : features and true labels to score against.
    title : title of the shared figure.
    """
    plt.figure(figsize=(10, 6))

    for model, step_name in pipelines:
        # Use positive-class probabilities when the classifier step offers them,
        # otherwise the pipeline's decision_function values.
        classifier = model.named_steps[step_name]
        if not hasattr(classifier, 'predict_proba'):
            scores = model.decision_function(X_test)
        else:
            scores = model.predict_proba(X_test)[:, 1]

        precision, recall, _ = precision_recall_curve(y_test, scores)
        pr_auc = auc(recall, precision)

        # One labelled curve per model, all on the same axes.
        plt.plot(recall, precision, label=f'{step_name} (AUC = {pr_auc:.2f})')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.show()
    
agg_precision_recall_curve_test(pipelines, X_test, y_test, 'Precision-Recall Curves (Test Data)')

## F1-micro score comparison

In [None]:
# Initialize lists to store F1-micro scores
f1_micro_train = []
f1_micro_test = []
labels = []

# Define a function to calculate and record F1-micro scores
def calculate_f1_micro(pipeline, classifier_step, X_train, y_train, X_test, y_test, label_prefix):
    """Fit ``pipeline`` and append its train/test F1-micro scores to the module-level lists.

    Parameters
    ----------
    pipeline : estimator to (re)fit on the training data.
    classifier_step : accepted for signature compatibility; not used in the body.
    X_train, y_train : data used for fitting and for the training score.
    X_test, y_test : data used for the test score.
    label_prefix : label appended to ``labels`` for this model.

    Appends to ``f1_micro_train``, ``f1_micro_test`` and ``labels``; returns nothing.
    """
    # Fit the pipeline on the training data
    pipeline.fit(X_train, y_train)

    # Predict on the training and test data
    y_pred_train = pipeline.predict(X_train)
    y_pred_test = pipeline.predict(X_test)

    # Calculate F1-micro scores
    f1_train = f1_score(y_train, y_pred_train, average='micro')
    f1_test = f1_score(y_test, y_pred_test, average='micro')

    # Append to lists
    f1_micro_train.append(f1_train)
    f1_micro_test.append(f1_test)
    labels.append(label_prefix)

# Iterate through the pipelines to calculate F1-micro scores
for pipeline, classifier_step in pipelines:
    calculate_f1_micro(pipeline, classifier_step, X_train, y_train, X_test, y_test, classifier_step)

# Plot the bar graph
plt.figure(figsize=(10, 6))
x = range(len(labels))
bar_width = 0.4
# align='edge' with a NEGATIVE width draws the bar to the left of the tick and a
# positive width to the right, so the two bars of a model sit side by side. The
# previous center/edge combination made them overlap by half a bar width.
plt.bar(x, f1_micro_train, width=-bar_width, align='edge', label='Training')
plt.bar(x, f1_micro_test, width=bar_width, align='edge', label='Test')
plt.xticks(x, labels)
plt.xlabel('Model')
plt.ylabel('F1-micro Score')
plt.title('F1-micro Scores (Train vs Test)')
plt.legend()
plt.grid(True)
plt.show()


In [None]:
from sklearn.metrics import f1_score

# Lists to store the F1-micro scores
f1_scores_train = []
f1_scores_test = []

# Calculate F1-micro scores for each pipeline
for pipeline, classifier_step in pipelines:
    pipeline_name = classifier_step  # You can customize this name

    # For the training set
    y_pred_train = pipeline.predict(X_train)
    f1_train = f1_score(y_train, y_pred_train, average='micro')
    f1_scores_train.append((pipeline_name, f1_train))

    # For the test set
    y_pred_test = pipeline.predict(X_test)
    f1_test = f1_score(y_test, y_pred_test, average='micro')
    f1_scores_test.append((pipeline_name, f1_test))

# Sort the F1-micro scores in descending order
f1_scores_train_sorted = sorted(f1_scores_train, key=lambda x: x[1], reverse=True)
f1_scores_test_sorted = sorted(f1_scores_test, key=lambda x: x[1], reverse=True)

# Function to print and plot the F1-micro scores
def display_and_plot(scores, title):
    """Print each ``(model, score)`` pair under ``title`` and show the scores as a bar chart.

    Parameters
    ----------
    scores : sequence of ``(model_name, f1_micro_score)`` pairs, plotted in the given order.
    title : heading printed above the listing and used as the chart title.
    """
    print(title)
    for model_name, f1_value in scores:
        print("Model: {}, F1-micro Score: {:.2f}".format(model_name, f1_value))

    # Split the pairs into parallel sequences for the bar chart.
    bar_names, bar_heights = zip(*scores)
    plt.bar(bar_names, bar_heights)
    plt.xlabel('Model')
    plt.ylabel('F1-micro Score')
    plt.title(title)
    plt.xticks(rotation=45)
    plt.show()

# Display and plot for both training and test sets
display_and_plot(f1_scores_train_sorted, 'F1-micro Scores on Training Set (X_train, y_train) in Descending Order')
display_and_plot(f1_scores_test_sorted, 'F1-micro Scores on Test Set (X_test, y_test) in Descending Order')


## Comparison of the losses - brier_score_loss

In [None]:


from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import brier_score_loss

# Lists to store names and losses (filled in parallel, one entry per pipeline)
names = []
losses = []

# Iterate through the pipelines and compute the Brier score loss of each on the test split
for pipeline, name in pipelines:
    try:
        # Make predictions on the test data: probability of the positive class (column 1)
        y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
    except AttributeError:
        # Reached when the pipeline exposes no predict_proba.
        # Calibrate the model to provide probabilities, using the updated parameter name
        # NOTE: this fits a new calibrated copy on the training data; `pipeline` itself
        # is only passed in as the estimator to calibrate.
        calibrated_clf = CalibratedClassifierCV(estimator=pipeline, method='sigmoid')
        calibrated_clf.fit(X_train, y_train)
        y_pred_proba = calibrated_clf.predict_proba(X_test)[:, 1]

    # Compute the Brier Score Loss between the true test labels and the probabilities
    loss = brier_score_loss(y_test, y_pred_proba)

    print(f"Brier Score Loss for {name}: {loss}")

    # Append to lists
    names.append(name)
    losses.append(loss)

# Create a DataFrame and sort by Brier Score Loss
import pandas as pd

results_df = pd.DataFrame({
    'Model': names,
    'Brier Score Loss': losses
})
results_df = results_df.sort_values(by='Brier Score Loss', ascending=True)

colors = ['red', 'green', 'blue', 'orange', 'purple', 'brown']

plt.bar(results_df['Model'], results_df['Brier Score Loss'], color=colors)
plt.ylabel('Brier Score Loss')
plt.title('Brier Score Loss for Different Models')
plt.xticks(rotation=45)
plt.show()




#### Take-away of the loss analysis:

> *logit* and *svc* have the lowest losses, meaning they perform better compared to the other models.

#### To identify models with the Highest and lowest TP,FP,TN,FN 

In [None]:
from sklearn.metrics import confusion_matrix

# Initialize variables to track highest and lowest TP, FP, FN, TN values
lowest_tp = lowest_fp = lowest_fn = lowest_tn = float('inf')
highest_tp = highest_fp = highest_fn = highest_tn = float('-inf')

# Lists to store results
highest_values = []
lowest_values = []
metrics = ['TP', 'FP', 'FN', 'TN']
models_highest = []
models_lowest = []

# Iterate through the pipelines
# For every pipeline, read TP/FP/FN/TN off the test-set confusion matrix and keep
# track of which model holds the highest and lowest value of each count.
for pipeline, classifier_step in pipelines:
    pipeline_name = classifier_step 
    y_pred = pipeline.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)
    # scikit-learn lays the binary matrix out as [[TN, FP], [FN, TP]], so ravel()
    # yields tn, fp, fn, tp. Unpacking it as tp, fp, fn, tn swapped the two diagonals.
    tn, fp, fn, tp = cm.ravel()

    # Check for highest values (strict comparison: the first model wins ties)
    if tp > highest_tp:
        highest_tp = tp
        highest_tp_model = pipeline_name
    if fp > highest_fp:
        highest_fp = fp
        highest_fp_model = pipeline_name
    if fn > highest_fn:
        highest_fn = fn
        highest_fn_model = pipeline_name
    if tn > highest_tn:
        highest_tn = tn
        highest_tn_model = pipeline_name

    # Check for lowest values (strict comparison: the first model wins ties)
    if tp < lowest_tp:
        lowest_tp = tp
        lowest_tp_model = pipeline_name
    if fp < lowest_fp:
        lowest_fp = fp
        lowest_fp_model = pipeline_name
    if fn < lowest_fn:
        lowest_fn = fn
        lowest_fn_model = pipeline_name
    if tn < lowest_tn:
        lowest_tn = tn
        lowest_tn_model = pipeline_name

    print(f"Pipeline: {pipeline_name}")
    print(f"True Positives: {tp}, False Positives: {fp}, False Negatives: {fn}, True Negatives: {tn}")
    print("-------------------------------------------------------------")

# Store the highest and lowest values
highest_values = [highest_tp, highest_fp, highest_fn, highest_tn]
lowest_values = [lowest_tp, lowest_fp, lowest_fn, lowest_tn]
models_highest = [highest_tp_model, highest_fp_model, highest_fn_model, highest_tn_model]
models_lowest = [lowest_tp_model, lowest_fp_model, lowest_fn_model, lowest_tn_model]

# Function to plot the bar graph
def plot_bar_graph(values, models, title):
    """Plot one bar per entry of the module-level ``metrics`` list, each annotated with a model name.

    Parameters
    ----------
    values : bar heights, in the same order as ``metrics``.
    models : model names; ``models[i]`` is written inside bar ``i``.
    title : chart title.
    """
    plt.bar(metrics, values)

    # Write each model name at half the height of its bar, centred horizontally.
    for bar_index, bar_height in enumerate(values):
        plt.text(bar_index, bar_height / 2, models[bar_index], ha='center', color='black')

    plt.xlabel('Metric')
    plt.ylabel('Value')
    plt.title(title)
    plt.show()

# Plot the graphs
plot_bar_graph(highest_values, models_highest, 'Models with Highest Values')
plot_bar_graph(lowest_values, models_lowest, 'Models with Lowest Values')


# Summary(without Hyper-parameter Tuning)

## Model summary without hyper-parameter tuning
 
> In terms of accuracy, the top-performing models without hyper-parameter tuning are **Logistic Regression** and **Calibrated LinearSVC** with an accuracy of **80%**, followed by **Complement Naive Bayes** with an accuracy of **79%**.

## Model Analysis
### Logistic Regression (logit):

> **Pros**: Balanced detection of both positive and negative sentiments.

> **Cons:** Potential misclassification may lead to inaccuracies in understanding customer sentiment, affecting decision-making in marketing or product development.

> **Business Implication:** Ideal for a general overview of customer sentiment. Suitable for applications like brand monitoring where both positive and negative sentiments are equally important.


### Naive Bayes (nb):

> **Pros**: High detection of negative sentiments (low False Negatives).

> **Cons:** The tendency to miss positive sentiments may lead to underestimation of customer satisfaction, potentially affecting strategies for brand promotion and loyalty programs.

> **Business Implication:** Useful when it's crucial to capture negative feedback, such as in quality control or customer service improvement.

### Stochastic Gradient Descent (sgd):

> **Pros:** Balanced detection of positive and negative sentiments.

> **Cons:** Moderate incorrect classifications could create challenges in precisely targeting customer segments or tailoring personalized marketing strategies.

> **Business Implication:** A versatile option that might require further tuning for specific use cases like targeted marketing or product enhancement.

### LightGBM (lgbm):

> **Pros:** Reasonable detection of positive sentiments.

> **Cons:** The potential misclassification of both positive and negative sentiments might lead to misguided business strategies, such as incorrect product improvements or inefficient allocation of resources.

> **Business Implication:** May need tuning for applications like assessing customer satisfaction or promoting positive reviews.

### Support Vector Classifier (svc):

> **Pros:** Highest detection of positive sentiments.

> **Cons:** Overlooking negative feedback might lead to missed opportunities for addressing customer grievances, potentially harming brand reputation or customer retention.

> **Business Implication:** Suitable for highlighting and leveraging positive feedback, such as in advertising or enhancing positive brand image.

### XGBoost (xgboost):

> **Pros:** Balanced detection of positive and negative sentiments.

> **Cons:** Some misclassifications may reduce the effectiveness of competitive analysis or market segmentation, leading to suboptimal business decisions.

> **Business Implication:** A flexible option that might need more tuning for applications like market segmentation or competitive analysis.


#### Decision:

**Best Model:** Both logit and svc are strong candidates.

> If the goal is to obtain a **balanced view of customer sentiments**, <code style="background:red;color:white">logit</code> might be the preferred choice.

> If the focus is on **leveraging positive feedback for marketing or brand enhancement**, <code style="background:red;color:white">svc</code> might be more suitable.

####  Business Considerations:
    The choice of model should align with the specific goals of the sentiment analysis:

- **Customer Service Improvement:** Focus on models that detect negative sentiments effectively (e.g., nb).

- **Brand Promotion:** Consider models that highlight positive sentiments (e.g., svc).

- **Overall Market Analysis:** Choose a model that provides a balanced view (e.g., logit).

In summary, the selection of the model should be closely tied to the business objectives of the sentiment analysis. Understanding the context, the importance of positive vs. negative sentiments, and the specific use case will guide the final decision. Collaboration with domain experts and further validation can also help in optimizing the model for the desired business outcome.

# Hyper-parameter Tuning

### Logistic Regression

In [None]:
# # Define the parameter grid
# param_grid_log = {
#     'preprocessor__tfidf__max_features': [1000, 5000], # 2 options
#     'preprocessor__tfidf__ngram_range': [(1, 1)], # 1 option
#     'logit__penalty': ['l1', 'l2'], # 2 options
#     'logit__C': [0.1, 1], # 2 options
#     'logit__fit_intercept': [True, False], # 2 options
#     'logit__solver': ['liblinear','saga'], # 1 option
#     'logit__class_weight': [None], # 1 option
#     'logit__max_iter': [100, 500] # 2 options
# }

# # Create the grid search object
# grid_search_log = GridSearchCV(pipe_log, param_grid_log, cv=5, verbose=1, scoring='f1') # You can change the scoring to any other appropriate metric

# # Fit the grid search on the training data
# grid_search_log.fit(X_train, y_train)


In [None]:
# from sklearn.model_selection import GridSearchCV

# # Now you can access the best_params_ and best_estimator_ attributes
# best_params = grid_search_log.best_params_
# best_estimator = grid_search_log.best_estimator_

In [None]:
# # Evaluate the best estimator on the test data
# score = best_estimator.score(X_test, y_test)
# score

In [None]:
#score=0.7901204227082821

In [None]:
# # Get the predictions for the test data
# y_pred = best_estimator.predict(X_test)

In [None]:
# # Print best hyperparameters and classification report
# print("Best Hyperparameters:", best_params)

In [None]:
#Best Hyperparameters: {'logit__C': 1, 'logit__class_weight': None, 'logit__fit_intercept': True, 'logit__max_iter': 100, 'logit__penalty': 'l1', 'logit__solver': 'liblinear', 'preprocessor__tfidf__max_features': 5000, 'preprocessor__tfidf__ngram_range': (1, 1)}

In [None]:
# report(X_test, y_test, grid_search_log)

#### Saving the hyper-tuned LogisticRegression model

In [None]:
# import pickle

# # Save the model to a file
# with open('best_logistic_model.pkl', 'wb') as f:
#     pickle.dump(grid_search_log, f)

# # You can also save the best parameters separately if needed
# with open('best_logistic_params.pkl', 'wb') as f:
#     pickle.dump(best_params, f)

# print("Model and parameters saved successfully!")

In [None]:
# # Load the model from the file
# with open('best_logistic_model.pkl', 'rb') as f:
#     loaded_model = pickle.load(f)

# # Load the best parameters from the file
# with open('best_logistic_params.pkl', 'rb') as f:
#     loaded_best_params = pickle.load(f)

# # You can now use loaded_model and loaded_best_params in your code
# print("Loaded Best Hyperparameters:", loaded_best_params)

### Hypertuning the SVC Model

In [None]:
# # Create a calibrated classifier from the LinearSVC
# calibrated_svc = CalibratedClassifierCV(base_estimator=pipe_svc)

# # Define the parameter grid
# param_grid_calibrated_svc = {
#     'base_estimator__preprocessor__tfidf__max_features': [1000, 3000], # Maximum number of features for TF-IDF
#     'base_estimator__preprocessor__tfidf__ngram_range': [(1, 1), (1, 2)], # Unigrams only, or unigrams plus bigrams
#     'base_estimator__svc__C': [0.1, 1, 10], # Regularization parameter
#     'base_estimator__svc__loss': ['hinge'], # Specifies the loss function
#     'base_estimator__svc__penalty': ['l2'], # Specifies the norm used in the penalization
#     'base_estimator__svc__fit_intercept': [True], # Specifies if a constant should be added to the decision function
#     'base_estimator__svc__max_iter': [1000], # Maximum number of iterations for the solvers to converge
#     'base_estimator__svc__class_weight': [None], # Weights associated with classes
#     'base_estimator__svc__multi_class': ['ovr'] # Determines the multi-class strategy
# }


# # Create the grid search object
# grid_calibrated_svc = GridSearchCV(calibrated_svc, param_grid_calibrated_svc, cv=5, verbose=1, scoring='f1') # You can change the scoring to any other appropriate metric

# # Fit the grid search on the training data
# grid_calibrated_svc.fit(X_train, y_train)


In [None]:
# print("Best parameters for Calibrated LinearSVC:", grid_calibrated_svc.best_params_)

In [None]:
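# # NOTE(review): `best_estimator` was assigned from grid_search_log.best_estimator_ in the
# # Logistic Regression section, so this line scores the tuned LogisticRegression again,
# # not the calibrated SVC. Use grid_calibrated_svc.best_estimator_.score(X_test, y_test)
# # to obtain the SVC score.
# score = best_estimator.score(X_test, y_test)
# score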

In [None]:
# score=0.7901204227082821

In [None]:
# # Evaluate the best Calibrated LinearSVC model
# report(X_test, y_test, grid_calibrated_svc)


#### Saving the hyper-tuned SVC model

In [None]:
# # Save the Calibrated LinearSVC model to a file
# with open('best_calibrated_svc_model.pkl', 'wb') as f:
#     pickle.dump(grid_calibrated_svc, f)

# # Save the best parameters of the Calibrated LinearSVC model to a file
# best_params_calibrated_svc = grid_calibrated_svc.best_params_
# with open('best_calibrated_svc_params.pkl', 'wb') as f:
#     pickle.dump(best_params_calibrated_svc, f)

# print("Calibrated LinearSVC model and parameters saved successfully!")


In [None]:
# # Load the Calibrated LinearSVC model from the file
# with open('best_calibrated_svc_model.pkl', 'rb') as f:
#     loaded_calibrated_svc_model = pickle.load(f)

# # Load the best parameters of the Calibrated LinearSVC model from the file
# with open('best_calibrated_svc_params.pkl', 'rb') as f:
#     loaded_best_calibrated_svc_params = pickle.load(f)

# # You can now use loaded_calibrated_svc_model and loaded_best_calibrated_svc_params in your code
# print("Loaded Best Hyperparameters for Calibrated LinearSVC:", loaded_best_calibrated_svc_params)


In [None]:
#Loaded Best Hyperparameters for Calibrated LinearSVC: {'base_estimator__preprocessor__tfidf__max_features': 3000, 'base_estimator__preprocessor__tfidf__ngram_range': (1, 1), 'base_estimator__svc__C': 1, 'base_estimator__svc__class_weight': None, 'base_estimator__svc__fit_intercept': True, 'base_estimator__svc__loss': 'hinge', 'base_estimator__svc__max_iter': 1000, 'base_estimator__svc__multi_class': 'ovr', 'base_estimator__svc__penalty': 'l2'}

# Summary
### Model summary after hyper-parameter tuning of two models

> The scores went down for both **LogisticRegression** and **LinearSVC**.

> This might be due to several reasons, such as overfitting or an unfavourable choice of parameters; further tuning is required to arrive at the best set of parameters.

> However, looking at the PR curves for the training and test data, overfitting does not seem to be the cause, so an unfavourable set of parameters is the most likely reason for the lower score after hyper-parameter tuning.


#### Making the submission.csv

In [None]:
def submission_csv(y_pred, submission_file="submission.csv"):
    """Build the submission table from binary predictions and write it to CSV.

    Parameters
    ----------
    y_pred : iterable of {0, 1}
        Predicted class labels; 1 is written as 'POSITIVE' and 0 as 'NEGATIVE'.
    submission_file : str or path-like, default "submission.csv"
        Destination of the CSV file. The default keeps the original behaviour
        of writing ``submission.csv`` to the current working directory.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``id`` (0..n-1, the position of each prediction) and
        ``sentiment`` (the mapped label).

    Raises
    ------
    KeyError
        If ``y_pred`` contains a value that is neither 0 nor 1.
    """
    # Map 1 to 'POSITIVE' and 0 to 'NEGATIVE'
    sentiment_mapping = {1: 'POSITIVE', 0: 'NEGATIVE'}
    y_pred_labels = [sentiment_mapping[pred] for pred in y_pred]

    # The id column is simply the row position of each prediction
    submission = pd.DataFrame({'id': range(len(y_pred_labels)), 'sentiment': y_pred_labels})

    # index=False keeps the DataFrame index out of the file
    submission.to_csv(submission_file, index=False)

    print(f'{submission_file} has been created successfully')
    print('Contents of the submission file:')
    # The frame itself is not printed here; it is returned so the notebook can
    # render it when the call is the last expression of a cell.
    return submission


# Select the three columns that are passed to the pipeline for the submission predictions
test_data_for_submission = merged_test_data.loc[
    :,
    ['reviewText', 'audienceScore', 'runtimeMinutes'],
]

# Predict sentiment for the actual test dataset with pipe_log
y_pred_submission = pipe_log.predict(test_data_for_submission)

# Write submission.csv; as the last expression, the returned frame is displayed by the notebook
submission_csv(y_pred_submission)

In [None]:
# Show the frame read from sample.csv (presumably the expected submission layout — confirm)
# so it can be compared with the generated submission.
sample_data

In [None]:
# Shape of the predictions returned by pipe_log.predict; its length is the number of rows
# that submission_csv wrote to the submission file.
y_pred_submission.shape

___________________________________________________________________________________________________________________