In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from itertools import compress

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE

import scipy.stats as stats
print("Imports done.")

Imports done.


### Custom functions  

In [2]:
def load_csv(filename: str):
    df = pd.read_csv(f"data/{filename}.csv")
    return df

In [3]:
def inspect(df: pd.DataFrame):
    print(f"Shape of the dataframe: {df.shape}")
    print()
    print(f"Columns in the dataframe:\n{df.columns}")
    print()
    print(f"{df.info()}")
    print()
    # print(f"Summary: {df.describe()}")
    print(f"Missing values:\n{df.isna().sum()}")
    return

In [4]:
def name_fl(name):
    l = name.split()
    n = ' '.join((l[0], l[-1]))
    return n

In [5]:
def select_features(df: pd.DataFrame, moviesdf: pd.DataFrame):
    '''
    This function merges the given dataframes. Note that the first df must be "train" or "test" and
    the second df should be "movies".
    Note: Sentiment column is present only in "train.csv" file and not "test.csv" file.
    '''
    
    # Drop duplicates from moviesdf
#     movies_unique = moviesdf.drop_duplicates(subset=["movieid"])
    # Drop duplicates using groupby - clubs similar rows and fills in missing values better
    movies_unique = moviesdf.fillna(value=np.nan).groupby("movieid").first()

    # Merge df and movies_unique
    df_merged = pd.merge(df, movies_unique, on="movieid", how='left')
    
    # Rename "isTopCritic" column, if it exists, to "isFrequentReviewer"
    df_merged.rename(columns={"isTopCritic": "isFrequentReviewer"}, inplace=True)
    
    # Drop columns
    df_merged = df_merged.drop(columns=["title", "ratingContents", "releaseDateTheaters", "releaseDateStreaming", "distributor", "soundType"])

    # Fill missing values in "reviewText" with empty string
    final = df_merged.copy()
    final["reviewYN"] = np.where(final["reviewText"].isnull(), 1, 0)    # Feature engineering - adding a new column
    final["reviewWC"] = final.apply(lambda x: len(str(x["reviewText"]).split()), axis=1)    # Feature engineering - adding second new column
    final["reviewText"] = final["reviewText"].fillna("neutral")
    
    # Fill missing values in "rating", "genre", original columns with the word "Unknown"
    final["rating"] = final["rating"].fillna("Unknown")
    final["genre"] = final["genre"].fillna("Unknown")
    final["originalLanguage"] = final["originalLanguage"].fillna("Unknown")

    # Impute missing values for "audienceScore" and "runtimeMinutes" columns
    final["audienceScore"] = final["audienceScore"].fillna(final["audienceScore"].mean())
    final["runtimeMinutes"] = final["runtimeMinutes"].fillna(final["runtimeMinutes"].median())
    
    # Preprocess and impute missing values in "boxOffice" column
    final["boxOffice"] = final["boxOffice"].str[1:]
    final["boxOffice"] = final["boxOffice"].replace(to_replace={"M": "*1000000", "K": "*1000"}, regex=True)
    final["boxOffice"] = final["boxOffice"].loc[final["boxOffice"].notnull()].apply(lambda x: eval(str(x)))
    final["boxOffice"] = final["boxOffice"].fillna(final["boxOffice"].median())
    # (Optional) Replace outliers in boxOffice with median
    median = final["boxOffice"].describe()['50%']
    iqr = final["boxOffice"].describe()['75%'] - final["boxOffice"].describe()['25%']
    ll = median - (1.5*iqr)
    ul = median + (1.5*iqr)
    final.loc[final["boxOffice"] > ul, "boxOffice"] = median
    
    # Clean language names
    final["originalLanguage"].replace({"English (United Kingdom)": "English", 
                                            "English (Australia)" : "English",
                                            "French (France)": "French", 
                                            "French (Canada)": "French",
                                            "Portuguese (Brazil)": "Portuguese",
                                            "Spanish (Spain)": "Spanish"},                                         
                                            inplace=True)
    
    # Clean reviewerName column
    pre_post_fixes = {"Mr. ": "", "Mrs. ": "", "Ms. ": "", "Dr. ": "", 
                      " MD": "", " DDS": "", " DVM": "", " Jr.": "", " PhD": "", " II": "", " IV": ""}
    final["reviewerName"] = final["reviewerName"].replace(pre_post_fixes, regex=True)
    final["reviewerName"] = final["reviewerName"].apply(name_fl)

    return final

In [6]:
merged = select_features(load_csv("train"), load_csv("movies"))
inspect(merged)

Shape of the dataframe: (162758, 14)

Columns in the dataframe:
Index(['movieid', 'reviewerName', 'isFrequentReviewer', 'reviewText',
       'sentiment', 'audienceScore', 'rating', 'runtimeMinutes', 'genre',
       'originalLanguage', 'director', 'boxOffice', 'reviewYN', 'reviewWC'],
      dtype='object')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 162758 entries, 0 to 162757
Data columns (total 14 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   movieid             162758 non-null  object 
 1   reviewerName        162758 non-null  object 
 2   isFrequentReviewer  162758 non-null  bool   
 3   reviewText          162758 non-null  object 
 4   sentiment           162758 non-null  object 
 5   audienceScore       162758 non-null  float64
 6   rating              162758 non-null  object 
 7   runtimeMinutes      162758 non-null  float64
 8   genre               162758 non-null  object 
 9   originalLanguage    162758

In [7]:
merged_test = select_features(load_csv("test"), load_csv("movies"))
inspect(merged_test)

Shape of the dataframe: (55315, 13)

Columns in the dataframe:
Index(['movieid', 'reviewerName', 'isFrequentReviewer', 'reviewText',
       'audienceScore', 'rating', 'runtimeMinutes', 'genre',
       'originalLanguage', 'director', 'boxOffice', 'reviewYN', 'reviewWC'],
      dtype='object')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 55315 entries, 0 to 55314
Data columns (total 13 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movieid             55315 non-null  object 
 1   reviewerName        55315 non-null  object 
 2   isFrequentReviewer  55315 non-null  bool   
 3   reviewText          55315 non-null  object 
 4   audienceScore       55315 non-null  float64
 5   rating              55315 non-null  object 
 6   runtimeMinutes      55315 non-null  float64
 7   genre               55315 non-null  object 
 8   originalLanguage    55315 non-null  object 
 9   director            55315 non-null  object 
 10  boxO

## Work on vocabulary  

In [8]:
rt_senti = merged[["reviewText", "sentiment"]].copy()
rt_senti.shape, rt_senti.head()

((162758, 2),
                                           reviewText sentiment
 0  Henry Selick’s first movie since 2009’s Corali...  POSITIVE
 1  With a cast that reads like the Vogue Oscar pa...  NEGATIVE
 2  Creed II does not give us anything but another...  POSITIVE
 3  I know what you're thinking, but this is no Li...  POSITIVE
 4  Director Fernando Meirelles tells the story wi...  POSITIVE)

In [9]:
rt_test = pd.DataFrame(merged_test["reviewText"].copy())
rt_test.shape, rt_test.head()

((55315, 1),
                                           reviewText
 0  Green slowly cranks up the dread with style an...
 1  Philip Noyce's direction is elegant and unforc...
 2  It wouldn't do to say what path Maria ultimate...
 3  Pig is not exactly the arthouse John Wick that...
 4  An imaginative no-budget musical of sorts abou...)

In [10]:
def get_vocab(text_df, ngram_range=(1,1)):
    print(f"Generating vocabulary for ngram_range: {ngram_range}...")
    tvec1 = TfidfVectorizer(ngram_range=ngram_range, stop_words='english')
    tvec1.fit(text_df)
    voc_ngram = list(tvec1.vocabulary_.keys())
    return voc_ngram

In [11]:
def get_common_vocab(df_list: list, ngram_range_list: list):
    """
    Get common vocabulary from a list of dataframes.
    """
    # Get vocabulary from each dataframe for a given ngram-range
    for ngram_range in ngram_range_list:
        vocab_ngram = set()
        for df in df_list:
            vocab_ngram_df = set(get_vocab(df['reviewText'], ngram_range))
            print(f"Vocabulary for ngram-range {ngram_range} in df: {len(vocab_ngram_df)}")
            if vocab_ngram:
                vocab_ngram = vocab_ngram.intersection(vocab_ngram_df)
            else:
                vocab_ngram = vocab_ngram_df
        print(f"Common vocabulary for ngram-range {ngram_range}: {len(vocab_ngram)}")   
    return vocab_ngram

In [12]:
common_unigrams = get_common_vocab([rt_senti, rt_test], ngram_range_list=[(1,1)])
common_unigrams

Generating vocabulary for ngram_range: (1, 1)...
Vocabulary for ngram-range (1, 1) in df: 64912
Generating vocabulary for ngram_range: (1, 1)...
Vocabulary for ngram-range (1, 1) in df: 43588
Common vocabulary for ngram-range (1, 1): 35946


{'jthj',
 'manhood',
 'rejected',
 'petra',
 'manga',
 'fleet',
 'radicalized',
 'glories',
 'headiness',
 'compounded',
 'turistas',
 'marginalized',
 'twinkie',
 'raves',
 'hip',
 'persist',
 'flutter',
 'brannaman',
 'ill',
 'contented',
 'transcendently',
 'topple',
 'norah',
 'defuses',
 'grumbling',
 'imploding',
 'tide',
 'fruitless',
 'transformative',
 'anarchic',
 'intruders',
 'adventureland',
 'standard',
 'imediatamente',
 'populace',
 'wade',
 'nego',
 'overlapping',
 'voyeurs',
 'interrupters',
 'centrally',
 'ugly',
 'paralysed',
 'writings',
 'advocate',
 'magical',
 'sams',
 'dwayne',
 'wrinkly',
 'reveled',
 'onscreen',
 'dazzles',
 'braving',
 'unfriend',
 'board',
 'iowa',
 'restless',
 'regaling',
 'complexity',
 'superpower',
 'zohan',
 'climb',
 'haven',
 'dynevor',
 'creepingly',
 'insistence',
 'odious',
 'chong',
 'drummond',
 'quickened',
 'lathan',
 'paints',
 'masked',
 'goodness',
 'grasps',
 'diablo',
 'yore',
 'suite',
 'visible',
 'flouting',
 'requiem

In [13]:
common_bigrams = get_common_vocab([rt_senti, rt_test], ngram_range_list=[(2,2)])
common_bigrams

Generating vocabulary for ngram_range: (2, 2)...
Vocabulary for ngram-range (2, 2) in df: 1146211
Generating vocabulary for ngram_range: (2, 2)...
Vocabulary for ngram-range (2, 2) in df: 442876
Common vocabulary for ngram-range (2, 2): 115420


{'lacks magic',
 'self effacing',
 'blanchett let',
 'home run',
 'smart self',
 'brief moment',
 'movie mayhem',
 'situations movie',
 'candid interviews',
 'fractured family',
 'performances lovely',
 'morbid curiosity',
 'story death',
 '8217 ending',
 'pointed wrong',
 'interesting intelligent',
 'fact explores',
 'cinematic excess',
 'live wire',
 'physical comedy',
 'kick head',
 'dark farce',
 'results eye',
 'talents world',
 'silliness 91',
 'paul rachman',
 'u2 3d',
 'heralding arrival',
 'ticks away',
 'simply lets',
 'enjoy latest',
 'men women',
 'feels 20',
 'movie holding',
 'musical energy',
 'story suggests',
 'life poverty',
 'luck 44',
 'turns bust',
 'path movie',
 'cold case',
 'movie roots',
 'display thriller',
 'humanist comedy',
 'pretty badly',
 'movies guys',
 'quot humanity',
 'unusual intelligence',
 'story audience',
 'stunning shots',
 'romance happens',
 'disappointing given',
 'purely enjoyable',
 'pulls heartstrings',
 'refusal pass',
 'paul bettany',


In [14]:
common_trigrams = get_common_vocab([rt_senti, rt_test], ngram_range_list=[(3,3)])
common_trigrams

Generating vocabulary for ngram_range: (3, 3)...
Vocabulary for ngram-range (3, 3) in df: 1443939
Generating vocabulary for ngram_range: (3, 3)...
Vocabulary for ngram-range (3, 3) in df: 498850
Common vocabulary for ngram-range (3, 3): 23768


{'deeper emotional connection',
 'les vampires thrill',
 'film completely different',
 'does just opposite',
 'age story boy',
 'world just little',
 'rewarding 46 46',
 'know thought ask',
 'fans earlier films',
 'conduit expose pockets',
 'old fashioned political',
 'feels like songs',
 'writer director assayas',
 'cheer gentlemanly honor',
 'amounts 90 minutes',
 'unimpressive animation service',
 'fans elated discover',
 'video game series',
 'little known facet',
 'does little make',
 'day review spanish',
 'catherine deneuve gerard',
 'felt like real',
 'entertaining movie right',
 'hope doesn happen',
 'final year obama',
 'janis joplin film',
 'asking big questions',
 'world building high',
 'ingmar bergman film',
 'appeal wide audience',
 'cliché didn like',
 'harry potter half',
 'uneven piece work',
 'mixture slapstick sentiment',
 'film does work',
 'raat akeli hai',
 'best runner snappy',
 'feeling current drought',
 'bullock gives best',
 'john carter big',
 'like underdo

`Get common words within training text corpus`  

In [44]:
df_shuffled = rt_senti.sample(frac=1)
df_splits = np.array_split(df_shuffled, 5)
# for df in df_splits:
#     print(df.shape)
#     display(df)

In [45]:
common_unigrams_train = get_common_vocab(df_splits, ngram_range_list=[(1,1)])
common_unigrams_train

Generating vocabulary for ngram_range: (1, 1)...
Vocabulary for ngram-range (1, 1) in df: 34666
Generating vocabulary for ngram_range: (1, 1)...
Vocabulary for ngram-range (1, 1) in df: 34342
Generating vocabulary for ngram_range: (1, 1)...
Vocabulary for ngram-range (1, 1) in df: 34678
Generating vocabulary for ngram_range: (1, 1)...
Vocabulary for ngram-range (1, 1) in df: 34567
Generating vocabulary for ngram_range: (1, 1)...
Vocabulary for ngram-range (1, 1) in df: 34642
Common vocabulary for ngram-range (1, 1): 16756


{'almereyda',
 'happytime',
 'manhood',
 'rejected',
 'silliest',
 'thurber',
 'foreboding',
 'manga',
 'arnaud',
 'taste',
 'fleet',
 'unravelling',
 'willie',
 'brutality',
 'karloff',
 'glories',
 'participating',
 'pathos',
 'compounded',
 'momentarily',
 'sitting',
 'blart',
 'marginalized',
 'teamwork',
 'hip',
 'devotes',
 'descriptions',
 'ill',
 'fossil',
 'heralds',
 'drab',
 'sign',
 'jim',
 '1974',
 'ha',
 'tide',
 'scratch',
 'goldthwait',
 'unparalleled',
 'unevenly',
 'stood',
 'transformative',
 'anarchic',
 'mundane',
 'adventureland',
 'intervals',
 'amuse',
 '24',
 'traumatized',
 'spoilers',
 'standard',
 'wade',
 'incoherence',
 'peyton',
 'hounds',
 'institution',
 'hokey',
 'lapse',
 'funding',
 'ugly',
 'favreau',
 'bromance',
 'archival',
 'simpsons',
 'embarrassed',
 'wayward',
 'danielle',
 'weighed',
 'advocate',
 'magical',
 'overweight',
 'dwayne',
 'kung',
 'cynics',
 'undertaking',
 'slipped',
 'onscreen',
 'dazzles',
 'board',
 'property',
 'hunt',
 'ha

In [57]:
sorted(list(common_unigrams_train))[:212]

['000',
 '007',
 '10',
 '100',
 '101',
 '105',
 '106',
 '108',
 '10th',
 '11',
 '1138',
 '12',
 '13',
 '133',
 '13th',
 '14',
 '140',
 '1408',
 '15',
 '150',
 '151',
 '157',
 '16',
 '160',
 '17',
 '17th',
 '18',
 '18th',
 '19',
 '1917',
 '1920s',
 '1930s',
 '1933',
 '1939',
 '1940',
 '1940s',
 '1941',
 '1942',
 '1948',
 '1950',
 '1950s',
 '1952',
 '1953',
 '1954',
 '1955',
 '1956',
 '1960',
 '1960s',
 '1961',
 '1962',
 '1963',
 '1964',
 '1965',
 '1966',
 '1967',
 '1968',
 '1969',
 '1970',
 '1970s',
 '1971',
 '1972',
 '1973',
 '1974',
 '1975',
 '1976',
 '1977',
 '1978',
 '1979',
 '1980',
 '1980s',
 '1981',
 '1982',
 '1984',
 '1985',
 '1986',
 '1987',
 '1988',
 '1989',
 '1990',
 '1990s',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '19th',
 '20',
 '200',
 '2000',
 '2000s',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2010s',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',


In [50]:
sorted(list(common_unigrams_train))[100:300]

['2007',
 '2008',
 '2009',
 '2010',
 '2010s',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017',
 '2018',
 '2019',
 '2020',
 '2021',
 '2022',
 '2023',
 '20s',
 '20th',
 '21',
 '21st',
 '22',
 '225',
 '23',
 '233',
 '24',
 '25',
 '25th',
 '26',
 '27',
 '28',
 '29',
 '2nd',
 '2u',
 '30',
 '300',
 '3000',
 '30s',
 '32',
 '33',
 '35',
 '35mm',
 '36',
 '360',
 '37',
 '38',
 '3d',
 '40',
 '400',
 '40s',
 '41',
 '42',
 '43',
 '44',
 '45',
 '46',
 '47',
 '48',
 '49',
 '50',
 '500',
 '50s',
 '51',
 '56',
 '58',
 '59',
 '60',
 '60s',
 '63',
 '65',
 '65279',
 '66',
 '666',
 '70',
 '70s',
 '71',
 '72',
 '75',
 '76',
 '77',
 '78',
 '80',
 '80s',
 '81',
 '82',
 '8211',
 '8212',
 '8216',
 '8217',
 '8220',
 '8221',
 '8230',
 '8232',
 '83',
 '84',
 '85',
 '86',
 '87',
 '88',
 '89',
 '90',
 '90s',
 '91',
 '93',
 '94',
 '95',
 '96',
 '97',
 '98',
 '99',
 'a24',
 'aardman',
 'aaron',
 'aback',
 'abandon',
 'abandoned',
 'abandonment',
 'abandons',
 'abba',
 'abbas',
 'abbey',
 'abc',
 'abducti

## Common vocabulary in train that is also in test  

In [48]:
len(common_unigrams_train & common_unigrams)

16441

In [49]:
sorted(list(common_unigrams_train & common_unigrams))[:100]

['000',
 '007',
 '10',
 '100',
 '101',
 '105',
 '106',
 '108',
 '10th',
 '11',
 '1138',
 '12',
 '13',
 '133',
 '13th',
 '14',
 '140',
 '1408',
 '15',
 '150',
 '151',
 '157',
 '16',
 '160',
 '17',
 '17th',
 '18',
 '18th',
 '19',
 '1917',
 '1920s',
 '1930s',
 '1933',
 '1939',
 '1940',
 '1940s',
 '1941',
 '1942',
 '1948',
 '1950',
 '1950s',
 '1952',
 '1953',
 '1954',
 '1955',
 '1956',
 '1960',
 '1960s',
 '1961',
 '1962',
 '1963',
 '1964',
 '1965',
 '1966',
 '1967',
 '1968',
 '1969',
 '1970',
 '1970s',
 '1971',
 '1972',
 '1973',
 '1974',
 '1975',
 '1976',
 '1977',
 '1978',
 '1979',
 '1980',
 '1980s',
 '1981',
 '1982',
 '1984',
 '1985',
 '1986',
 '1987',
 '1988',
 '1989',
 '1990',
 '1990s',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '19th',
 '20',
 '200',
 '2000',
 '2000s',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006']

## Vocabulary after replacing numbers with empty string in reviewText column  

In [60]:
# Replace numbers with empty string in reviewText
rt_tr_pure_str = rt_senti.copy()
rt_tr_pure_str["reviewText"] = rt_senti["reviewText"].str.replace('\d+', '', regex=True)
rt_tr_pure_str

Unnamed: 0,reviewText,sentiment
0,Henry Selick’s first movie since ’s Coraline. ...,POSITIVE
1,With a cast that reads like the Vogue Oscar pa...,NEGATIVE
2,Creed II does not give us anything but another...,POSITIVE
3,"I know what you're thinking, but this is no Li...",POSITIVE
4,Director Fernando Meirelles tells the story wi...,POSITIVE
...,...,...
162753,A top-notch thriller with genuine surprises an...,POSITIVE
162754,Some people find Derek Zoolander funny and lik...,NEGATIVE
162755,"This fun, gentle comedy focuses mainly on them...",POSITIVE
162756,"The film is rescued by a strong third act, but...",NEGATIVE


In [61]:
# Replace numbers with empty string in reviewText
rt_test_pure_str = rt_test.copy()
rt_test_pure_str["reviewText"] = rt_test["reviewText"].str.replace('\d+', '', regex=True)
rt_test_pure_str

Unnamed: 0,reviewText
0,Green slowly cranks up the dread with style an...
1,Philip Noyce's direction is elegant and unforc...
2,It wouldn't do to say what path Maria ultimate...
3,Pig is not exactly the arthouse John Wick that...
4,An imaginative no-budget musical of sorts abou...
...,...
55310,Ron Howard delivers an unconventional romantic...
55311,As an oddball art film that openly invites you...
55312,Nicholson wears his devilish grin from his fir...
55313,It's hard not be entertained by two dozen of C...


In [68]:
df_shuffled_pure_str = rt_tr_pure_str.sample(frac=1)
df_splits_pure_str = np.array_split(df_shuffled_pure_str, 5)
# for df in df_splits:
#     print(df.shape)
#     display(df)

In [69]:
common_unigrams_train_pure_str = get_common_vocab(df_splits_pure_str, ngram_range_list=[(1,1)])
common_unigrams_train_pure_str

Generating vocabulary for ngram_range: (1, 1)...
Vocabulary for ngram-range (1, 1) in df: 34338
Generating vocabulary for ngram_range: (1, 1)...
Vocabulary for ngram-range (1, 1) in df: 34178
Generating vocabulary for ngram_range: (1, 1)...
Vocabulary for ngram-range (1, 1) in df: 33881
Generating vocabulary for ngram_range: (1, 1)...
Vocabulary for ngram-range (1, 1) in df: 34060
Generating vocabulary for ngram_range: (1, 1)...
Vocabulary for ngram-range (1, 1) in df: 34210
Common vocabulary for ngram-range (1, 1): 16431


{'almereyda',
 'happytime',
 'rejected',
 'manhood',
 'silliest',
 'arnaud',
 'foreboding',
 'manga',
 'petra',
 'taste',
 'fleet',
 'brutality',
 'karloff',
 'glories',
 'pathos',
 'compounded',
 'sitting',
 'blart',
 'marginalized',
 'teamwork',
 'hip',
 'devotes',
 'enraged',
 'ill',
 'heralds',
 'drab',
 'sign',
 'jim',
 'ha',
 'tide',
 'inserted',
 'scratch',
 'goldthwait',
 'unevenly',
 'unparalleled',
 'stood',
 'transformative',
 'anarchic',
 'mundane',
 'adventureland',
 'intervals',
 'amuse',
 'spoilers',
 'standard',
 'foolishly',
 'wade',
 'peyton',
 'incoherence',
 'institution',
 'hokey',
 'lapse',
 'patriots',
 'ugly',
 'favreau',
 'bromance',
 'archival',
 'simpsons',
 'embarrassed',
 'wayward',
 'danielle',
 'interrogates',
 'weighed',
 'advocate',
 'magical',
 'dwayne',
 'kung',
 'cynics',
 'undertaking',
 'authors',
 'slipped',
 'dazzles',
 'onscreen',
 'board',
 'property',
 'hunt',
 'handles',
 'bacall',
 'restless',
 'casanova',
 'refreshing',
 'dance',
 'naturali

In [71]:
sorted(list(common_unigrams_train_pure_str))[:210]

['aardman',
 'aaron',
 'abandon',
 'abandoned',
 'abandoning',
 'abandonment',
 'abandons',
 'abbey',
 'abbott',
 'abduction',
 'abhorrent',
 'abiding',
 'abilities',
 'ability',
 'abject',
 'able',
 'ably',
 'aboard',
 'abominable',
 'abomination',
 'abortion',
 'abound',
 'abounds',
 'abraham',
 'abrahamson',
 'abrams',
 'abrasive',
 'abroad',
 'abrupt',
 'abruptly',
 'absence',
 'absent',
 'absolute',
 'absolutely',
 'absorb',
 'absorbed',
 'absorbing',
 'absorbs',
 'absorption',
 'abstract',
 'abstraction',
 'absurd',
 'absurdist',
 'absurdities',
 'absurdity',
 'absurdly',
 'abu',
 'abundance',
 'abundant',
 'abuse',
 'abused',
 'abusive',
 'abysmal',
 'abyss',
 'academic',
 'academy',
 'accent',
 'accents',
 'accentuate',
 'accept',
 'acceptable',
 'acceptance',
 'accepted',
 'accepting',
 'accepts',
 'access',
 'accessible',
 'accident',
 'accidental',
 'accidentally',
 'acclaim',
 'acclaimed',
 'accolades',
 'accommodate',
 'accompanied',
 'accompanies',
 'accompaniment',
 'acc

In [95]:
# Find rows containing numbers in rt_tr_pure_str
print(rt_tr_pure_str.shape)
dfff = rt_tr_pure_str["reviewText"][rt_tr_pure_str["reviewText"].str.contains(r'\d', case=False, regex=True)]
dfff.shape, dfff.head()
# dfff.to_csv("dfff.csv", index=False)

(162758, 2)
