# Applied Data Analysis Project
**Team**: ToeStewBrr - Alexander Sternfeld, Marguerite Thery, Antoine Bonnet, Hugo Bordereaux

**Dataset**: CMU Movie Summary Corpus

## Textual Analysis

We first download the data files, then load the pre-processed plot summaries dataframe and the movie dataframe.

In [None]:
from load_data import *
from coreNLP_analysis import *
from textual_analysis import *
import spacy

# Presumably fetches the project data files (helper comes from load_data) — it
# runs before the loaders below, so keep this order.
download_data()
# Plot summaries; the 'Summary' column of this frame is processed below.
plot_df = load_plot_df()
# Movie dataframe; merged with the plot summaries on 'Wikipedia ID' later on.
movie_df = load_movie_df()

### 1. Remove Stop Words

A stop word is a frequently used term that a search engine has been configured to ignore, both when indexing entries for searching and when retrieving them as the result of a search query. Examples of stop words include "the," "a," "an," and "in."
We don't want these terms to take up unnecessary storage space or processing time in our analysis. By keeping a list of the terms we consider to be stop words, we can easily remove them from the plot summaries.

In [None]:
# Build a separate dataframe whose 'Summary' column holds the stop-word-free
# summaries; `assign` returns a new frame, so plot_df itself stays untouched.
plot_df_removed = plot_df.assign(Summary=plot_df['Summary'].apply(remove_stopwords))



In [None]:
# Show the last five rows to eyeball the result of the stop-word removal.
plot_df_removed.tail(n=5)

In [None]:
nlp = spacy.load("en_core_web_lg")
words = nlp("love")

# Add one column per reference word, named after the word, holding the
# similarity between that word and each summary.
# Each summary is joined with spaces before parsing, so 'Summary' is presumably
# a list of tokens at this point — TODO confirm against remove_stopwords.
for word in words:
    # Compare against the current `word`, not the whole `words` doc: otherwise
    # every column would receive the same score once `words` holds more than
    # one token. `word=word` pins the loop variable inside the lambda.
    plot_df_removed[word.text] = plot_df_removed['Summary'].apply(
        lambda tokens, word=word: nlp(' '.join(tokens)).similarity(word)
    )

In [None]:
# Show the first five rows, now including the similarity column(s).
plot_df_removed.head(n=5)

In [None]:
# Reorder the rows in place so the summaries with the highest 'love'
# similarity come first, then preview the top five.
plot_df_removed.sort_values(by=['love'], ascending=False, inplace=True)
plot_df_removed.head(n=5)

In [None]:
def extract_love_words(text, threshold=0.35, reference="love"):
    """Return the words of a summary that are similar to the reference word(s).

    Args:
        text: iterable of word strings; they are joined with spaces and parsed
            with the module-level spaCy pipeline ``nlp``.
        threshold: a word is kept when its similarity to a reference token is
            strictly greater than this value.
        reference: string whose tokens the summary words are compared against.

    Returns:
        List of matching word strings, in summary order. With several reference
        tokens the matches for each token are concatenated, so a word can
        appear more than once.
    """
    reference_tokens = nlp(reference)
    # Parse the summary once; the original re-parsed it for every reference token.
    summary_doc = nlp(' '.join(text))
    return [
        token.text
        for reference_token in reference_tokens
        for token in summary_doc
        if token.similarity(reference_token) > threshold
    ]

# Create a 'love_words' column with the love-related words of each summary.
# Only the first 10 rows are processed (the most love-like summaries if the
# frame has been sorted by 'love' beforehand); all other rows hold NaN.
words = nlp("love")
n_processed = 10
first_summaries = plot_df_removed['Summary'].iloc[:n_processed]

# For each processed summary, extract the love-related words and order them by
# increasing similarity to "love" (same ascending order as before).
first_love_words = [
    sorted(extract_love_words(summary), key=lambda w: nlp(w).similarity(words))
    for summary in first_summaries
]

# Assign the whole column at once. The previous chained indexing
# (df['love_words'][:10] = ...) writes through a temporary object, which pandas
# warns about and which does nothing under copy-on-write.
n_remaining = len(plot_df_removed) - len(first_love_words)
plot_df_removed['love_words'] = first_love_words + [np.nan] * n_remaining

plot_df_removed.head()

In [None]:
# Hand-labelled evaluation words: (word, 1 if considered love-related else 0).
# NOTE(review): 'vehicule' looks like a misspelling of 'vehicle' and may have no
# vector in the model — confirm whether that is intended.
test_words = [
    ('wedding', 1), ('valentine', 1), ('girlfriend', 1), ('going out', 1),
    ('hate', 0), ('cash', 0), ('beautiful', 1), ('ugly', 0),
    ('mushroom', 0), ('glass', 0), ('phone', 0), ('bank', 0),
    ('partner', 1), ('admiration', 1), ('dinner', 0), ('union', 1),
    ('tender', 1), ('vehicule', 0), ('computer', 0), ('safety', 0),
    ('kiss', 1), ('fun', 0), ('nerves', 0), ('aggressive', 0),
    ('jealous', 1), ('sober', 0), ('forgive', 0), ('daughter', 0),
    ('punishment', 0), ('relation', 1), ('date', 1), ('perfume', 0),
    ('affectionate', 1), ('friend', 0), ('jewels', 0), ('commitment', 1),
    ('passion', 1),
]

# Print the similarity between each test word and "love".
# The reference is parsed once instead of once per test word.
love_doc = nlp("love")
for candidate, _label in test_words:
    print(candidate, nlp(candidate).similarity(love_doc))

In [None]:
score_threshold = 0.35

# One row per test word together with its hand-assigned label.
love_words = pd.DataFrame(test_words, columns=['word', 'love_related'])

# Similarity of each word to the reference phrase. Note that the reference is
# the four-word phrase below, not the single word "love". It is parsed once
# here rather than once per row.
reference_doc = nlp("love valentine wedding girlfriend")
love_words['score'] = love_words['word'].apply(lambda w: nlp(w).similarity(reference_doc))

# 1 if the score is strictly above the threshold, 0 otherwise.
love_words['above_threshold'] = (love_words['score'] > score_threshold).astype(int)
love_words




In [None]:
def compute_f1_score(Truth, Prediction):
    """Compute the F1 score of binary predictions against binary labels.

    Args:
        Truth: sequence (list, pandas Series, ...) of ground-truth labels,
            where 1 marks the positive class.
        Prediction: sequence of predicted labels of the same length, paired
            with ``Truth`` by position.

    Returns:
        The F1 score as a float. Returns 0.0 when precision or recall is
        undefined (no predicted or no actual positives) or when there are no
        true positives.

    Raises:
        ValueError: if the two sequences have different lengths.
    """
    if len(Truth) != len(Prediction):
        raise ValueError("Truth and Prediction must have the same length")

    # Pair labels and predictions row by row. The previous version indexed
    # Prediction with the label value itself (0 or 1), so it only ever looked
    # at the first two predictions.
    pairs = list(zip(Truth, Prediction))
    TP = sum(1 for truth, predicted in pairs if truth == 1 and predicted == 1)
    FP = sum(1 for truth, predicted in pairs if truth == 0 and predicted == 1)
    FN = sum(1 for truth, predicted in pairs if truth == 1 and predicted == 0)

    # Without true positives the score is 0 (or undefined); this also guards
    # every division below against a zero denominator.
    if TP == 0:
        return 0.0

    precision = TP / (TP + FP)
    recall = TP / (TP + FN)
    return 2 * precision * recall / (precision + recall)

In [None]:
# F1 score of the predictions currently stored in 'above_threshold'.
baseline_f1 = compute_f1_score(love_words['love_related'], love_words['above_threshold'])
print('F1 score at initial threshold is', baseline_f1)

# Maximize the F1 score over thresholds 0.00, 0.01, ..., 0.99.
max_f1 = 0
max_threshold = 0
for i in range(100):
    candidate_threshold = i/100
    # Re-derive the predictions for this threshold. The previous loop reused
    # the stale 'above_threshold' column, so every iteration got the same score.
    candidate_prediction = (love_words['score'] > candidate_threshold).astype(int)
    f1 = compute_f1_score(love_words['love_related'], candidate_prediction)
    # Strict comparison: on ties the lowest threshold is kept.
    if f1 > max_f1:
        max_f1 = f1
        max_threshold = candidate_threshold

print('optimal threshold is', max_threshold)

# Store the predictions obtained with the best threshold.
love_words['above_threshold'] = love_words['score'].apply(lambda x: 1 if x > max_threshold else 0)
love_words.head(20)

In [None]:
# Attach the movie columns to each plot summary: inner join on 'Wikipedia ID',
# so only movies present in both dataframes are kept.
plot_genre_df = plot_df_removed.merge(right=movie_df, how='inner', on='Wikipedia ID')



In [None]:
#Add a column with a boolean value indicating if the movie is a love movie
romance_genres = ['Romantic comedy', 'Romance Film', 'Romantic drama', 'Romantic fantasy', 'Romantic thriller']


def is_romantic(i=slice(None)):
    """Build a predicate telling whether a genre list contains a romance genre.

    Args:
        i: index or slice into ``romance_genres`` selecting which genres count
            as romantic. Defaults to all of them.

    Returns:
        A function that takes a movie's genres and returns True when the value
        is a list containing at least one selected genre. Any non-list value
        (e.g. a missing entry) yields False.
    """
    selected = romance_genres[i]
    # An integer index yields a single string; wrap it so membership stays an
    # exact genre match rather than a substring test.
    if isinstance(selected, str):
        selected = [selected]
    selected = tuple(selected)

    def has_selected_genre(genres):
        return isinstance(genres, list) and any(genre in selected for genre in genres)

    return has_selected_genre


plot_genre_df["Romantic"] = plot_genre_df['Genres'].apply(is_romantic(slice(0, 5)))
plot_genre_df.head()