### Hypothesis 2: NLP on Critics' Reviews

In [None]:
%%time

# Keep only the two columns this experiment needs: the review text (model
# input) and the Oscar outcome (label).
df_nlp2 = df[["Critic_Review", "Won_Oscars"]]
X, y = df_nlp2['Critic_Review'], df_nlp2['Won_Oscars']

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# ratio the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

def remove_accented_chars(text):
  """Fold ``text`` down to plain ASCII.

  The string is NFKD-decomposed so that accented letters split into a base
  letter plus combining marks; every code point that still has no ASCII
  encoding (the marks, and any other non-ASCII character) is then dropped.

  Args:
    text: the string to convert.

  Returns:
    An ASCII-only ``str``.
  """
  decomposed = unicodedata.normalize('NFKD', text)
  ascii_bytes = decomposed.encode('ascii', 'ignore')
  return ascii_bytes.decode('utf-8', 'ignore')

def pre_process_corpus(docs):
  """Normalise raw review texts for bag-of-words style vectorisation.

  Each document goes through these steps, in order:
    1. newlines, tabs and carriage returns become spaces, and typographic
       apostrophes become the ASCII apostrophe;
    2. the text is lowercased;
    3. accented / non-ASCII characters are folded to ASCII or dropped
       (``remove_accented_chars``);
    4. contractions are expanded (``contractions.fix``);
    5. every character that is not an ASCII letter, digit or whitespace is
       removed;
    6. runs of spaces are collapsed and the result is stripped.

  Args:
    docs: iterable of ``str`` documents (it is wrapped in a tqdm progress
      bar, so a sized iterable gives the most useful display).

  Returns:
    list of normalised ``str`` documents, in the same order as ``docs``.
  """
  # Built once, outside the loop. Typographic apostrophes are mapped to "'"
  # here because the ASCII fold in step 3 would otherwise delete them before
  # contractions.fix runs, turning e.g. "it’s" into "its" and "we’ll" into
  # "well" instead of expanding them.
  char_table = str.maketrans({
      "\n": " ",
      "\t": " ",
      "\r": " ",
      "\u2018": "'",  # left single quotation mark
      "\u2019": "'",  # right single quotation mark
      "\u02bc": "'",  # modifier letter apostrophe
      "\u00b4": "'",  # acute accent used as an apostrophe
  })
  special_chars = re.compile(r'[^a-zA-Z0-9\s]', flags=re.I|re.A)
  space_runs = re.compile(' +')

  norm_docs = []
  for doc in tqdm.tqdm(docs):
    # replace newlines / tabs with spaces and normalise apostrophes
    doc = doc.translate(char_table)
    # lowercase the text
    doc = doc.lower()
    # remove accented characters from text => convert to plain english
    doc = remove_accented_chars(doc)
    # expand contractions, e.g. won't => will not
    doc = contractions.fix(doc)

    # remove special characters, then collapse repeated spaces
    doc = special_chars.sub('', doc)
    doc = space_runs.sub(' ', doc)
    doc = doc.strip()

    norm_docs.append(doc)

  return norm_docs

# Apply the same text normalisation to both splits (train first, then test).
norm_train_reviews, norm_test_reviews = map(pre_process_corpus, (X_train, X_test))

(680,) (171,) (680,) (171,)


100%|██████████| 680/680 [00:02<00:00, 254.22it/s]
100%|██████████| 171/171 [00:00<00:00, 281.69it/s]

CPU times: user 3.22 s, sys: 45.3 ms, total: 3.27 s
Wall time: 3.34 s





In [None]:
%%time

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# --- Bag-of-words (raw term counts) ---
# ngram_range=(1,2): single words and bigrams are both features.
# min_df=5: drop terms occurring in fewer than 5 training reviews (very rare terms).
# max_df=1.0: keep terms even if they occur in every review.
cv = CountVectorizer(
    ngram_range=(1,2),
    min_df=5,
    max_df=1.0,
    binary=False,
)
cv_train_features = cv.fit_transform(norm_train_reviews)
# Test reviews are only transformed (never fitted), so their columns are the
# vocabulary learnt from the training reviews.
cv_test_features = cv.transform(norm_test_reviews)

# --- TF-IDF, same vocabulary settings, with sublinear term-frequency scaling ---
tv = TfidfVectorizer(
    ngram_range=(1,2),
    min_df=5,
    max_df=1.0,
    use_idf=True,
    sublinear_tf=True,
)
tv_train_features = tv.fit_transform(norm_train_reviews)
tv_test_features = tv.transform(norm_test_reviews)

print('BOW model:> Train features shape:', cv_train_features.shape, ' Test features shape:', cv_test_features.shape)
print('TFIDF model:> Train features shape:', tv_train_features.shape, ' Test features shape:', tv_test_features.shape)

BOW model:> Train features shape: (680, 91251)  Test features shape: (171, 91251)
TFIDF model:> Train features shape: (680, 91251)  Test features shape: (171, 91251)
CPU times: user 8.76 s, sys: 199 ms, total: 8.96 s
Wall time: 9.14 s


In [None]:
%%time

# Random Forest model on BOW features
from sklearn.ensemble import RandomForestClassifier

# Build the class-weighted random forest and fit it on the bag-of-words
# training matrix in one step (fit returns the estimator itself).
rf = RandomForestClassifier(
    random_state=15,
    class_weight='balanced',
    n_estimators=140,
    max_depth=20,
    min_samples_leaf=2,
    min_samples_split=5,
).fit(cv_train_features, y_train)

# Score the held-out reviews.
rf_bow_predictions = rf.predict(cv_test_features)

print(classification_report(y_test, rf_bow_predictions))

# Confusion matrix as a labelled table: rows are the true classes, columns the
# predicted ones. The labels are applied positionally.
# NOTE(review): 'negative'/'positive' presumably stand for Won_Oscars
# False/True - confirm and consider renaming.
labels = ['negative', 'positive']
pd.DataFrame(
    data=confusion_matrix(y_test, rf_bow_predictions),
    columns=labels,
    index=labels,
)

              precision    recall  f1-score   support

       False       0.78      0.95      0.86       127
        True       0.62      0.23      0.33        44

    accuracy                           0.77       171
   macro avg       0.70      0.59      0.60       171
weighted avg       0.74      0.77      0.72       171

CPU times: user 936 ms, sys: 18.6 ms, total: 955 ms
Wall time: 985 ms


Unnamed: 0,negative,positive
negative,121,6
positive,34,10
