In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [6]:
traindf = pd.read_csv("data/train.csv")
traindf.shape

(162758, 5)

In [7]:
testdf = pd.read_csv("data/test.csv")
testdf.shape

(55315, 4)

In [8]:
X_train = traindf["reviewText"]
X_train.fillna("neutral", inplace=True)
X_train

0         Henry Selick’s first movie since 2009’s Corali...
1         With a cast that reads like the Vogue Oscar pa...
2         Creed II does not give us anything but another...
3         I know what you're thinking, but this is no Li...
4         Director Fernando Meirelles tells the story wi...
                                ...                        
162753    A top-notch thriller with genuine surprises an...
162754    Some people find Derek Zoolander funny and lik...
162755    This fun, gentle comedy focuses mainly on them...
162756    The film is rescued by a strong third act, but...
162757            A peerless exercise in stimulus response.
Name: reviewText, Length: 162758, dtype: object

In [9]:
y_train = traindf['sentiment']
y_train

0         POSITIVE
1         NEGATIVE
2         POSITIVE
3         POSITIVE
4         POSITIVE
            ...   
162753    POSITIVE
162754    NEGATIVE
162755    POSITIVE
162756    NEGATIVE
162757    POSITIVE
Name: sentiment, Length: 162758, dtype: object

In [10]:
X_test = testdf["reviewText"]
X_test.fillna("neutral", inplace=True)
X_test

0        Green slowly cranks up the dread with style an...
1        Philip Noyce's direction is elegant and unforc...
2        It wouldn't do to say what path Maria ultimate...
3        Pig is not exactly the arthouse John Wick that...
4        An imaginative no-budget musical of sorts abou...
                               ...                        
55310    Ron Howard delivers an unconventional romantic...
55311    As an oddball art film that openly invites you...
55312    Nicholson wears his devilish grin from his fir...
55313    It's hard not be entertained by two dozen of C...
55314    Not clever enough for Smith fans, not gross en...
Name: reviewText, Length: 55315, dtype: object

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

## Define stop words  

In [12]:
stop_words = ["0o", "0s", "3a", "3b", "3d", "6b", "6o", "a", "a1", "a2", "a3", "a4", "ab", "able", "about", "above", "abst", "ac", "accordance", "according", "accordingly", "across", "act", "actually", "ad", "added", "adj", "ae", "af", "affected", "affecting", "affects", "after", "afterwards", "ag", "again", "against", "ah", "ain", "ain't", "aj", "al", "all", "allow", "allows", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "announce", "another", "any", "anybody", "anyhow", "anymore", "anyone", "anything", "anyway", "anyways", "anywhere", "ao", "ap", "apart", "apparently", "appear", "appreciate", "appropriate", "approximately", "ar", "are", "aren", "arent", "aren't", "arise", "around", "as", "a's", "aside", "ask", "asking", "associated", "at", "au", "auth", "av", "available", "aw", "away", "awfully", "ax", "ay", "az", "b", "b1", "b2", "b3", "ba", "back", "bc", "bd", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "begin", "beginning", "beginnings", "begins", "behind", "being", "believe", "below", "beside", "besides", "best", "better", "between", "beyond", "bi", "bill", "biol", "bj", "bk", "bl", "bn", "both", "bottom", "bp", "br", "brief", "briefly", "bs", "bt", "bu", "but", "bx", "by", "c", "c1", "c2", "c3", "ca", "call", "came", "can", "cannot", "cant", "can't", "cause", "causes", "cc", "cd", "ce", "certain", "certainly", "cf", "cg", "ch", "changes", "ci", "cit", "cj", "cl", "clearly", "cm", "c'mon", "cn", "co", "com", "come", "comes", "con", "concerning", "consequently", "consider", "considering", "contain", "containing", "contains", "corresponding", "could", "couldn", "couldnt", "couldn't", "course", "cp", "cq", "cr", "cry", "cs", "c's", "ct", "cu", "currently", "cv", "cx", "cy", "cz", "d", "d2", "da", "date", "dc", "dd", "de", "definitely", "describe", "described", "despite", "detail", "df", "di", "did", "didn", "didn't", "different", "dj", "dk", "dl", "do", "does", "doesn", "doesn't", "doing", "don", "done", "don't", "down", "downwards", "dp", "dr", "ds", "dt", "du", "due", "during", "dx", "dy", "e", "e2", "e3", "ea", "each", "ec", "ed", "edu", "ee", "ef", "effect", "eg", "ei", "eight", "eighty", "either", "ej", "el", "eleven", "else", "elsewhere", "em", "empty", "en", "end", "ending", "enough", "entirely", "eo", "ep", "eq", "er", "es", "especially", "est", "et", "et-al", "etc", "eu", "ev", "even", "ever", "every", "everybody", "everyone", "everything", "everywhere", "ex", "exactly", "example", "except", "ey", "f", "f2", "fa", "far", "fc", "few", "ff", "fi", "fifteen", "fifth", "fify", "fill", "find", "fire", "first", "five", "fix", "fj", "fl", "fn", "fo", "followed", "following", "follows", "for", "former", "formerly", "forth", "forty", "found", "four", "fr", "from", "front", "fs", "ft", "fu", "full", "further", "furthermore", "fy", "g", "ga", "gave", "ge", "get", "gets", "getting", "gi", "give", "given", "gives", "giving", "gj", "gl", "go", "goes", "going", "gone", "got", "gotten", "gr", "greetings", "gs", "gy", "h", "h2", "h3", "had", "hadn", "hadn't", "happens", "hardly", "has", "hasn", "hasnt", "hasn't", "have", "haven", "haven't", "having", "he", "hed", "he'd", "he'll", "hello", "help", "hence", "her", "here", "hereafter", "hereby", "herein", "heres", "here's", "hereupon", "hers", "herself", "hes", "he's", "hh", "hi", "hid", "him", "himself", "his", "hither", "hj", "ho", "home", "hopefully", "how", "howbeit", "however", "how's", "hr", "hs", "http", "hu", "hundred", "hy", "i", "i2", "i3", "i4", "i6", "i7", "i8", "ia", "ib", "ibid", "ic", "id", "i'd", "ie", "if", "ig", "ignored", "ih", "ii", "ij", "il", "i'll", "im", "i'm", "immediate", "immediately", "importance", "important", "in", "inasmuch", "inc", "indeed", "index", "indicate", "indicated", "indicates", "information", "inner", "insofar", "instead", "interest", "into", "invention", "inward", "io", "ip", "iq", "ir", "is", "isn", "isn't", "it", "itd", "it'd", "it'll", "its", "it's", "itself", "iv", "i've", "ix", "iy", "iz", "j", "jj", "jr", "js", "jt", "ju", "just", "k", "ke", "keep", "keeps", "kept", "kg", "kj", "km", "know", "known", "knows", "ko", "l", "l2", "la", "largely", "last", "lately", "later", "latter", "latterly", "lb", "lc", "le", "least", "les", "less", "lest", "let", "lets", "let's", "lf", "like", "liked", "likely", "line", "little", "lj", "ll", "ll", "ln", "lo", "look", "looking", "looks", "los", "lr", "ls", "lt", "ltd", "m", "m2", "ma", "made", "mainly", "make", "makes", "many", "may", "maybe", "me", "mean", "means", "meantime", "meanwhile", "merely", "mg", "might", "mightn", "mightn't", "mill", "million", "mine", "miss", "ml", "mn", "mo", "more", "moreover", "most", "mostly", "move", "mr", "mrs", "ms", "mt", "mu", "much", "mug", "must", "mustn", "mustn't", "my", "myself", "n", "n2", "na", "name", "namely", "nay", "nc", "nd", "ne", "near", "nearly", "necessarily", "necessary", "need", "needn", "needn't", "needs", "neither", "never", "nevertheless", "new", "next", "ng", "ni", "nine", "ninety", "nj", "nl", "nn", "no", "nobody", "non", "none", "nonetheless", "noone", "nor", "normally", "nos", "not", "noted", "nothing", "novel", "now", "nowhere", "nr", "ns", "nt", "ny", "o", "oa", "ob", "obtain", "obtained", "obviously", "oc", "od", "of", "off", "often", "og", "oh", "oi", "oj", "ok", "okay", "ol", "old", "om", "omitted", "on", "once", "one", "ones", "only", "onto", "oo", "op", "oq", "or", "ord", "os", "ot", "other", "others", "otherwise", "ou", "ought", "our", "ours", "ourselves", "out", "outside", "over", "overall", "ow", "owing", "own", "ox", "oz", "p", "p1", "p2", "p3", "page", "pagecount", "pages", "par", "part", "particular", "particularly", "pas", "past", "pc", "pd", "pe", "per", "perhaps", "pf", "ph", "pi", "pj", "pk", "pl", "placed", "please", "plus", "pm", "pn", "po", "poorly", "possible", "possibly", "potentially", "pp", "pq", "pr", "predominantly", "present", "presumably", "previously", "primarily", "probably", "promptly", "proud", "provides", "ps", "pt", "pu", "put", "py", "q", "qj", "qu", "que", "quickly", "quite", "qv", "r", "r2", "ra", "ran", "rather", "rc", "rd", "re", "readily", "really", "reasonably", "recent", "recently", "ref", "refs", "regarding", "regardless", "regards", "related", "relatively", "research", "research-articl", "respectively", "resulted", "resulting", "results", "rf", "rh", "ri", "right", "rj", "rl", "rm", "rn", "ro", "rq", "rr", "rs", "rt", "ru", "run", "rv", "ry", "s", "s2", "sa", "said", "same", "saw", "say", "saying", "says", "sc", "sd", "se", "sec", "second", "secondly", "section", "see", "seeing", "seem", "seemed", "seeming", "seems", "seen", "self", "selves", "sensible", "sent", "serious", "seriously", "seven", "several", "sf", "shall", "shan", "shan't", "she", "shed", "she'd", "she'll", "shes", "she's", "should", "shouldn", "shouldn't", "should've", "show", "showed", "shown", "showns", "shows", "si", "side", "significant", "significantly", "similar", "similarly", "since", "sincere", "six", "sixty", "sj", "sl", "slightly", "sm", "sn", "so", "some", "somebody", "somehow", "someone", "somethan", "something", "sometime", "sometimes", "somewhat", "somewhere", "soon", "sorry", "sp", "specifically", "specified", "specify", "specifying", "sq", "sr", "ss", "st", "still", "stop", "strongly", "sub", "substantially", "successfully", "such", "sufficiently", "suggest", "sup", "sure", "sy", "system", "sz", "t", "t1", "t2", "t3", "take", "taken", "taking", "tb", "tc", "td", "te", "tell", "ten", "tends", "tf", "th", "than", "thank", "thanks", "thanx", "that", "that'll", "thats", "that's", "that've", "the", "their", "theirs", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "thered", "therefore", "therein", "there'll", "thereof", "therere", "theres", "there's", "thereto", "thereupon", "there've", "these", "they", "theyd", "they'd", "they'll", "theyre", "they're", "they've", "thickv", "thin", "think", "third", "this", "thorough", "thoroughly", "those", "thou", "though", "thoughh", "thousand", "three", "throug", "through", "throughout", "thru", "thus", "ti", "til", "tip", "tj", "tl", "tm", "tn", "to", "together", "too", "took", "top", "toward", "towards", "tp", "tq", "tr", "tried", "tries", "truly", "try", "trying", "ts", "t's", "tt", "tv", "twelve", "twenty", "twice", "two", "tx", "u", "u201d", "ue", "ui", "uj", "uk", "um", "un", "under", "unfortunately", "unless", "unlike", "unlikely", "until", "unto", "uo", "up", "upon", "ups", "ur", "us", "use", "used", "useful", "usefully", "usefulness", "uses", "using", "usually", "ut", "v", "va", "value", "various", "vd", "ve", "ve", "very", "via", "viz", "vj", "vo", "vol", "vols", "volumtype", "vq", "vs", "vt", "vu", "w", "wa", "want", "wants", "was", "wasn", "wasnt", "wasn't", "way", "we", "wed", "we'd", "welcome", "well", "we'll", "well-b", "went", "were", "we're", "weren", "werent", "weren't", "we've", "what", "whatever", "what'll", "whats", "what's", "when", "whence", "whenever", "when's", "where", "whereafter", "whereas", "whereby", "wherein", "wheres", "where's", "whereupon", "wherever", "whether", "which", "while", "whim", "whither", "who", "whod", "whoever", "whole", "who'll", "whom", "whomever", "whos", "who's", "whose", "why", "why's", "wi", "widely", "will", "willing", "wish", "with", "within", "without", "wo", "won", "wonder", "wont", "won't", "words", "world", "would", "wouldn", "wouldnt", "wouldn't", "www", "x", "x1", "x2", "x3", "xf", "xi", "xj", "xk", "xl", "xn", "xo", "xs", "xt", "xv", "xx", "y", "y2", "yes", "yet", "yj", "yl", "you", "youd", "you'd", "you'll", "your", "youre", "you're", "yours", "yourself", "yourselves", "you've", "yr", "ys", "yt", "z", "zero", "zi", "zz"]
type(stop_words), len(stop_words)

(list, 1160)

## Model evaluation plan and code  

In [29]:
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import f1_score

In [30]:
def evaluate(y_test, y_pred):
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    print(f1_score(y_test, y_pred))
    ConfusionMatrixDisplay(y_test, y_pred)
    return

## Logistic Regression with TfidVectorizer for preprocessing  

In [13]:
logreg = LogisticRegression(max_iter=1000)
logreg

LogisticRegression(max_iter=1000)

In [14]:
tvec = TfidfVectorizer(stop_words=stop_words)
cvec = CountVectorizer(stop_words=stop_words)
tvec, cvec

(TfidfVectorizer(stop_words=['0o', '0s', '3a', '3b', '3d', '6b', '6o', 'a', 'a1',
                             'a2', 'a3', 'a4', 'ab', 'able', 'about', 'above',
                             'abst', 'ac', 'accordance', 'according',
                             'accordingly', 'across', 'act', 'actually', 'ad',
                             'added', 'adj', 'ae', 'af', 'affected', ...]),
 CountVectorizer(stop_words=['0o', '0s', '3a', '3b', '3d', '6b', '6o', 'a', 'a1',
                             'a2', 'a3', 'a4', 'ab', 'able', 'about', 'above',
                             'abst', 'ac', 'accordance', 'according',
                             'accordingly', 'across', 'act', 'actually', 'ad',
                             'added', 'adj', 'ae', 'af', 'affected', ...]))

In [15]:
pipe = Pipeline(steps=[
    ("vectorizer", tvec),
    ("LogisticRegression", logreg)
])
pipe

Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['0o', '0s', '3a', '3b', '3d', '6b',
                                             '6o', 'a', 'a1', 'a2', 'a3', 'a4',
                                             'ab', 'able', 'about', 'above',
                                             'abst', 'ac', 'accordance',
                                             'according', 'accordingly',
                                             'across', 'act', 'actually', 'ad',
                                             'added', 'adj', 'ae', 'af',
                                             'affected', ...])),
                ('LogisticRegression', LogisticRegression(max_iter=1000))])

In [16]:
pipe.fit(X_train, y_train)



Pipeline(steps=[('vectorizer',
                 TfidfVectorizer(stop_words=['0o', '0s', '3a', '3b', '3d', '6b',
                                             '6o', 'a', 'a1', 'a2', 'a3', 'a4',
                                             'ab', 'able', 'about', 'above',
                                             'abst', 'ac', 'accordance',
                                             'according', 'accordingly',
                                             'across', 'act', 'actually', 'ad',
                                             'added', 'adj', 'ae', 'af',
                                             'affected', ...])),
                ('LogisticRegression', LogisticRegression(max_iter=1000))])

In [17]:
y_pred = pipe.predict(X_test)
y_pred

array(['POSITIVE', 'POSITIVE', 'POSITIVE', ..., 'NEGATIVE', 'POSITIVE',
       'NEGATIVE'], dtype=object)

In [18]:
y_pred.shape

(55315,)

In [19]:
np.unique(y_pred, return_counts=True)

(array(['NEGATIVE', 'POSITIVE'], dtype=object),
 array([12884, 42431], dtype=int64))

In [20]:
def submit(y_pred):
    pred_df = pd.DataFrame(y_pred)
    pred_df.columns = ["sentiment"]
    pred_df.index.name = "id"
    pred_df.to_csv("submission.csv")
    return "Successfully created the submission file!!!"

In [21]:
submit(y_pred)

'Successfully created the submission file!!!'

In [31]:
# yp = pipe.predict(y_train)
# evaluate(y_train, yp)

              precision    recall  f1-score   support

    NEGATIVE       1.00      1.00      1.00     53997
    POSITIVE       1.00      1.00      1.00    108761

    accuracy                           1.00    162758
   macro avg       1.00      1.00      1.00    162758
weighted avg       1.00      1.00      1.00    162758

[[ 53997      0]
 [     0 108761]]


ValueError: pos_label=1 is not a valid label. It should be one of ['NEGATIVE', 'POSITIVE']

## Logistic Regression with CountVectorizer for preprocessing  

In [22]:
pipe_cvec = Pipeline(steps=[
    ("vectorizer", cvec),
    ("LogisticRegression", logreg)
])
pipe_cvec

Pipeline(steps=[('vectorizer',
                 CountVectorizer(stop_words=['0o', '0s', '3a', '3b', '3d', '6b',
                                             '6o', 'a', 'a1', 'a2', 'a3', 'a4',
                                             'ab', 'able', 'about', 'above',
                                             'abst', 'ac', 'accordance',
                                             'according', 'accordingly',
                                             'across', 'act', 'actually', 'ad',
                                             'added', 'adj', 'ae', 'af',
                                             'affected', ...])),
                ('LogisticRegression', LogisticRegression(max_iter=1000))])

In [23]:
# pipe_cvec.fit(X_train, y_train)

In [24]:
# y_pred_cvec = pipe_cvec.predict(X_test)
# y_pred_cvec

In [25]:
# y_pred_cvec.shape

In [26]:
# np.unique(y_pred_cvec, return_counts=True)

In [27]:
# submit(y_pred_cvec)