In [1]:
# TODO: Remove this before submission
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"


In [2]:
import pandas as pd
import numpy as np
import nltk

nltk.download("wordnet")
import re
from bs4 import BeautifulSoup


[nltk_data] Downloading package wordnet to /Users/aditya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
! pip install bs4 # in case you don't have it installed

# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Jewelry_v1_00.tsv.gz



In [4]:
DATA_PATH = "./data"
DATA_FILE = "amazon_reviews_us_Jewelry_v1_00.tsv"

DATA_COL = "review_body"
TARGET_COL = "star_rating"

RANDOM_SEED = 42

## Read Data


## Keep Reviews and Ratings


In [5]:
# Load the tab separated data file, and print the first 5 rows for confirmation
data = pd.read_csv(f"{DATA_PATH}/amazon_reviews_us_Jewelry_v1_00.tsv", sep="\t", usecols=[TARGET_COL, DATA_COL])
data.head()


  data = pd.read_csv(f"{DATA_PATH}/amazon_reviews_us_Jewelry_v1_00.tsv", sep="\t", usecols=[TARGET_COL, DATA_COL])


Unnamed: 0,star_rating,review_body
0,5,so beautiful even tho clearly not high end ......
1,5,"Great product.. I got this set for my mother, ..."
2,5,Exactly as pictured and my daughter's friend l...
3,5,Love it. Fits great. Super comfortable and nea...
4,5,Got this as a Mother's Day gift for my Mom and...


Understanding Data


In [6]:
data.describe()


Unnamed: 0,star_rating,review_body
count,1767042,1766807
unique,11,1618522
top,5,Love it
freq,1041056,4288


In [7]:
data.star_rating.unique()


array([5, 1, 4, 3, 2, nan, '5', '1', '3', '4', '2', '2012-12-21'],
      dtype=object)

In [8]:
data.groupby([TARGET_COL]).count()


Unnamed: 0_level_0,review_body
star_rating,Unnamed: 1_level_1
1,150441
2,97259
3,153660
4,259019
5,1040896
1,4566
2,3541
2012-12-21,0
3,5999
4,11411


In [9]:
# Drop the outlier which is star_rating = "2012-12-21"
data = data[data.star_rating != "2012-12-21"]


In [10]:
# Remove nan valued rows
data = data[data.star_rating.notnull()]


In [11]:
data.describe()


Unnamed: 0,star_rating,review_body
count,1767041,1766807
unique,10,1618522
top,5,Love it
freq,1041056,4288


In [12]:
# Convert all star rating to integer
data[TARGET_COL] = data.star_rating.astype(int)


In [13]:
data.head()


Unnamed: 0,star_rating,review_body
0,5,so beautiful even tho clearly not high end ......
1,5,"Great product.. I got this set for my mother, ..."
2,5,Exactly as pictured and my daughter's friend l...
3,5,Love it. Fits great. Super comfortable and nea...
4,5,Got this as a Mother's Day gift for my Mom and...


In [14]:
data = data[data.review_body.notnull()]
data.head()


Unnamed: 0,star_rating,review_body
0,5,so beautiful even tho clearly not high end ......
1,5,"Great product.. I got this set for my mother, ..."
2,5,Exactly as pictured and my daughter's friend l...
3,5,Love it. Fits great. Super comfortable and nea...
4,5,Got this as a Mother's Day gift for my Mom and...


In [15]:
# There are no empty reviews
(data.review_body.str.len() <= 0).sum()


0

Now we can continue with the process.


## We select 20000 reviews randomly from each rating class.


In [16]:
# np.random.seed(101)
N_SAMPLES = 25000
N_SAMPLES_ACTUAL = 20000


In [17]:
sampled_data = data.groupby(TARGET_COL, group_keys=False).apply(lambda x: x.sample(N_SAMPLES, random_state=RANDOM_SEED))
sampled_data.reset_index(inplace=True)
sampled_data.drop(columns=["index"], inplace=True)


In [18]:
sampled_data.head()

Unnamed: 0,star_rating,review_body
0,1,Too small even for the knuckles.
1,1,Did not fit right
2,1,This stupid kit has 16 gauge needles not 14gauge.
3,1,I would not suggest this item I bought the one...
4,1,I am sure that it will be lovely once I get it...


# Data Cleaning


In [19]:
avg_len_before_cleaning = sampled_data.review_body.str.len().mean()
f"Avg. length of reviews BEFORE CLEANING :: {avg_len_before_cleaning}"


'Avg. length of reviews BEFORE CLEANING :: 189.880568'

In [20]:
# Convert all reviews to lower case (optional according to study)
def to_lower(data: pd.Series):
    return data.str.lower()


In [21]:
def remove_accented_characters(data: pd.Series):
    import unicodedata

    """Removes accented characters from the Series

    Args:
        data (pd.Series): Series of string

    Returns:
        _type_: pd.Series
    """
    import unicodedata

    return data.apply(lambda x: unicodedata.normalize("NFKD", x).encode("ascii", "ignore").decode("utf-8", "ignore"))


In [22]:
def remove_html_and_url(data: pd.Series):
    """Function to remove
             1. HTML encodings
             2. HTML tags (both closed and open)
             3. URLs

    Args:
        data (pd.Series): A Pandas series of type string

    Returns:
        _type_: pd.Series
    """
    # Remove HTML encodings
    data.str.replace(r"&#\d+;", " ", regex=True)

    # Remove HTML tags (both open and closed)
    data.str.replace(r"<[a-zA-Z]+\s?/?>", " ", regex=True)

    # Remove URLs
    data.str.replace(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", " ", regex=True)

    return data


In [23]:
! pip install emot



In [24]:
# Handle emoji
def convert_emoji_to_txt(data: pd.Series):
  from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO

  EMO_TO_TXT_DICT = dict()
  for emot in UNICODE_EMOJI:
    EMO_TO_TXT_DICT[emot] = f" {re.sub(r',|:|_', '', UNICODE_EMOJI[emot])} "

  for emo in EMOTICONS_EMO:
    EMO_TO_TXT_DICT[emot] = f" {re.sub(r',| ', '', EMOTICONS_EMO[emo])} "

  def convert_emojis(text, emo_to_txt_dict):
    for emot in emo_to_txt_dict:
        text = text.replace(emot, emo_to_txt_dict[emot])
    return text

  return data.apply(lambda x: convert_emojis(x, EMO_TO_TXT_DICT))

In [25]:
# Remove non-alphabetical characters
def remove_non_alpha_characters(data: pd.Series):
    return data.str.replace(r"_+|\\|[^a-zA-Z\s]", " ", regex=True)


In [26]:
# Remove extra spaces
def remove_extra_spaces(data: pd.Series):
    return data.str.replace(r"^\s*|\s\s*", " ", regex=True)


In [27]:
# Install contractions package, if you don't have it
! pip install contractions



In [28]:
# Expanding contractions
def fix_contractions(data: pd.Series):
    import contractions

    def contraction_fixer(txt: str):
        return " ".join([contractions.fix(word) for word in txt.split()])

    return data.apply(contraction_fixer)


In [29]:
# A dictionary containing the columns and a list of functions to perform on it in order
data_cleaning_pipeline = {
    DATA_COL: [
        convert_emoji_to_txt,
        to_lower,
        remove_accented_characters,
        remove_html_and_url,
        fix_contractions,
        remove_non_alpha_characters,
        remove_extra_spaces,
    ]
}

cleaned_data = sampled_data.copy()

# Process all the cleaning instructions
for col, pipeline in data_cleaning_pipeline.items():
    # Get the column to perform cleaning on
    temp_data = cleaned_data[col].copy()

    # Perform all the cleaning functions sequencially
    for func in pipeline:
        print(f"Starting: {func.__name__}")
        temp_data = func(temp_data)
        print(f"Ended: {func.__name__}")

    # Replace the old column with cleaned one.
    cleaned_data[col] = temp_data.copy()


Starting: convert_emoji_to_txt
Ended: convert_emoji_to_txt
Starting: to_lower
Ended: to_lower
Starting: remove_accented_characters
Ended: remove_accented_characters
Starting: remove_html_and_url
Ended: remove_html_and_url
Starting: fix_contractions
Ended: fix_contractions
Starting: remove_non_alpha_characters
Ended: remove_non_alpha_characters
Starting: remove_extra_spaces
Ended: remove_extra_spaces


In [30]:
avg_len_after_cleaning = cleaned_data.review_body.str.len().mean()
f"Avg. length of reviews after cleaning :: {avg_len_after_cleaning}"


'Avg. length of reviews after cleaning :: 185.460632'

In [31]:
f"Before Cleaning: {avg_len_before_cleaning} ;; After Cleaning: {avg_len_after_cleaning}"


'Before Cleaning: 189.880568 ;; After Cleaning: 185.460632'

In [32]:
# TODO: Remove Test Block
# Checkpoint: Cleaned Data
cleaned_data.to_csv(f"{DATA_PATH}/cleaned.tsv", sep="\t", index=False, encoding="UTF-8")


# Pre-processing


## remove the stop words


In [33]:
# TODO: Remove Test Block
# cleaned_data = pd.read_csv(f"{DATA_PATH}/cleaned.tsv", sep="\t")


In [34]:
avg_len_before_preprocessing = cleaned_data[DATA_COL].str.len().mean()
print(f"Avg. length of the reviews before preprocessing :: {avg_len_before_preprocessing}")


Avg. length of the reviews before preprocessing :: 185.460632


In [35]:
def tokenize(data: pd.Series):
    from nltk.tokenize import word_tokenize

    nltk.download("punkt")

    return data.apply(word_tokenize)


In [36]:
from typing import List, Set


def remove_stopwords(data: pd.Series):
    """Remove stop words using the NLTK stopwords dictionary

    Args:
        string (str): a document

    Returns:
        str: a document with stopwords removed
    """
    from nltk.corpus import stopwords

    nltk.download("stopwords")

    stopwords = set(stopwords.words())

    def remover(word_list: List[str], stopwords: Set[str]):
        return [word for word in word_list if not word in stopwords]

    return data.apply(lambda word_list: remover(word_list, stopwords))


## perform lemmatization


In [37]:
def lemmatize(data: pd.Series, consider_pos_tag: bool = True):
    from nltk.corpus import wordnet
    from nltk.stem import WordNetLemmatizer

    nltk.download("omw-1.4")

    # POS tagging
    def perform_nltk_pos_tag(data: pd.Series):
        from nltk import pos_tag

        nltk.download("averaged_perceptron_tagger")

        return data.apply(pos_tag)

    # Convert POS tag to wordnet pos tags
    def wordnet_pos_tagger(tag: str):
        if tag.startswith("J"):
            return wordnet.ADJ
        elif tag.startswith("V"):
            return wordnet.VERB
        elif tag.startswith("N"):
            return wordnet.NOUN
        elif tag.startswith("R"):
            return wordnet.ADV
        else:
            return None

    lemmatizer = WordNetLemmatizer()
    lemmatized = list()

    if consider_pos_tag:
        pos_tagged_data = data.copy()
        pos_tagged_data = perform_nltk_pos_tag(data)

        for row in pos_tagged_data:

            lemmatized_row = list()

            if consider_pos_tag:
                for word, tag in row:
                    wordnet_pos_tag = wordnet_pos_tagger(tag)

                    if wordnet_pos_tag is None:
                        lemmatized_row.append(word)
                    else:
                        result = lemmatizer.lemmatize(word, wordnet_pos_tag)
                        lemmatized_row.append(lemmatizer.lemmatize(word, wordnet_pos_tag))

            lemmatized.append(lemmatized_row)
    else:
        for row in data:
            lemmatized_row = list()

            for word in row:
                lemmatized_row.append(lemmatizer.lemmatize(word))

            lemmatized.append(lemmatized_row)

    return pd.Series(lemmatized)


In [38]:
# Concatenate lemmatized sentences back into one sentence
def concatenate(data: pd.Series):
    return data.apply(lambda words: " ".join(words))


In [40]:
preprocessing_pipeline = {DATA_COL: [tokenize, lemmatize, concatenate]}

# Run the pipeline
preprocessed_data = cleaned_data.copy()

# Process all the cleaning instructions
for col, pipeline in preprocessing_pipeline.items():
    # Get the column to perform cleaning on
    temp_data = preprocessed_data[col]

    # Perform all the cleaning functions sequencially
    for func in pipeline:
        print(f"Starting: {func.__name__}")

        if func.__name__ == "lemmatize":
            temp_data = func(temp_data, consider_pos_tag=True)
        else:
            temp_data = func(temp_data)

        print(f"Ended: {func.__name__}")

    # Replace the old column with cleaned one.
    preprocessed_data[col] = temp_data.copy()


Starting: tokenize


[nltk_data] Downloading package punkt to /Users/aditya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Ended: tokenize
Starting: lemmatize


[nltk_data] Downloading package omw-1.4 to /Users/aditya/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/aditya/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Ended: lemmatize
Starting: concatenate
Ended: concatenate


In [41]:
preprocessed_data.head()

Unnamed: 0,star_rating,review_body
0,1,too small even for the knuckle
1,1,do not fit right
2,1,this stupid kit have gauge needle not gauge
3,1,i would not suggest this item i buy the one wi...
4,1,i be sure that it will be lovely once i get it...


In [42]:
# TODO: Remove Test Block
# CHECKPOINT
# Save lemmatized data
import pickle as pkl

with open(f"{DATA_PATH}/preprocessed.pkl", "wb") as file:
    pkl.dump(preprocessed_data, file)


In [43]:
avg_len_after_preprocessing = preprocessed_data[DATA_COL].str.len().mean()
print(f"Avg. length of the reviews after preprocessing :: {avg_len_after_preprocessing}")


Avg. length of the reviews after preprocessing :: 176.958256


In [44]:
f"Before Preprocessing: {avg_len_before_preprocessing} ;; After Preprocessing: {avg_len_after_preprocessing}"


'Before Preprocessing: 185.460632 ;; After Preprocessing: 176.958256'

# TF-IDF Feature Extraction


In [45]:
# TODO: Remove Test Block
# Load lemmatized data
# import pickle as pkl

# preprocessed_data = None
# with open(f"{DATA_PATH}/preprocessed.pkl", "rb") as file:
#     lemmatized_data = pkl.load(file)


In [46]:
preprocessed_data[preprocessed_data[DATA_COL].str.len() == 0].groupby(TARGET_COL).count()


Unnamed: 0_level_0,review_body
star_rating,Unnamed: 1_level_1
1,8
2,3
3,2
4,2
5,13


In [47]:
preprocessed_data[preprocessed_data[DATA_COL].isnull()]


Unnamed: 0,star_rating,review_body


In [48]:
preprocessed_data[DATA_COL].isnull().values.any(), preprocessed_data[DATA_COL].isnull().sum()


(False, 0)

In [49]:
# Drop empty strings
preprocessed_data = preprocessed_data[preprocessed_data[DATA_COL].str.len() != 0]
# Drop NA reviews
preprocessed_data.dropna(subset=[DATA_COL], inplace=True)


In [50]:
preprocessed_data[preprocessed_data[DATA_COL].str.len() == 0].groupby(TARGET_COL).count()


Unnamed: 0_level_0,review_body
star_rating,Unnamed: 1_level_1


In [51]:
# TODO: Remove this block
# Retrieve checkpoint
# preprocessed_data = pd.read_csv(f"{DATA_PATH}/data.tsv", sep="\t")


In [52]:
preprocessed_data.groupby(['star_rating']).count()

Unnamed: 0_level_0,review_body
star_rating,Unnamed: 1_level_1
1,24992
2,24997
3,24998
4,24998
5,24987


In [53]:
# Resample data.
data = preprocessed_data.groupby(TARGET_COL, group_keys=False).apply(lambda x: x.sample(N_SAMPLES_ACTUAL, random_state=RANDOM_SEED))
data.reset_index(inplace=True)
data.drop(columns=["index"], inplace=True)


In [54]:
data.groupby(['star_rating']).count()

Unnamed: 0_level_0,review_body
star_rating,Unnamed: 1_level_1
1,20000
2,20000
3,20000
4,20000
5,20000


In [55]:
# TODO: Remove Checkpoint ...
data.to_csv(f"{DATA_PATH}/data.tsv", sep="\t", index=False)


In [70]:
# Split the data 80-20 split
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.2, stratify=data[TARGET_COL], random_state=RANDOM_SEED)


In [71]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize

nltk.download("punkt")

vectorizer = TfidfVectorizer(tokenizer=word_tokenize)
vectorizer.fit(train[DATA_COL])

X_tfidf_train = vectorizer.transform(train[DATA_COL])
X_tfidf_test = vectorizer.transform(test[DATA_COL])
y_train = train[TARGET_COL]
y_test = test[TARGET_COL]


[nltk_data] Downloading package punkt to /Users/aditya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True



### Helper Functions


In [72]:
def calc_metrics(y_true, y_pred):
    from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average=None)
    recall = recall_score(y_true, y_pred, average=None)
    f1 = f1_score(y_true, y_pred, average=None)

    print(f"{accuracy}")
    print(f"{np.mean(f1)}")

    for rating_precision, rating_recall, rating_f1 in zip(precision, recall, f1):
        print(f"{rating_precision},{rating_recall},{rating_f1}")

    print(f"{np.mean(precision)},{np.mean(recall)},{np.mean(f1)}")


# Perceptron


In [73]:
from sklearn.linear_model import Perceptron

clf = Perceptron(max_iter=2000, alpha=0.5, random_state=RANDOM_SEED)
clf.fit(X_tfidf_train, y_train)

y_pred = clf.predict(X_tfidf_test)

print("Train ...")
calc_metrics(y_train, clf.predict(X_tfidf_train))
print("Test...")
calc_metrics(y_test, y_pred)

Train ...
0.57775
0.5766838129669472
0.7147243749086124,0.611,0.6588045016510546
0.5833407731452549,0.408375,0.4804235138414029
0.4440879120879121,0.6314375,0.5214451612903226
0.5387837837837838,0.498375,0.5177922077922078
0.6734391895737294,0.7395625,0.704953680259748
0.5908752066998584,0.57775,0.5766838129669472
Test...
0.42455
0.42184910144323656
0.5761255924170616,0.48625,0.5273861171366594
0.3409177095183921,0.22475,0.27090552960675
0.31042694335771387,0.45625,0.3694705941896953
0.35983153461437223,0.34175,0.35055776381587384
0.5697377581805523,0.61375,0.5909255024672042
0.4314079076176185,0.42455,0.42184910144323656


In [75]:
from sklearn.linear_model import Perceptron

clf = Perceptron()
clf.fit(X_tfidf_train, y_train)

y_pred = clf.predict(X_tfidf_test)

print("Train ...")
calc_metrics(y_train, clf.predict(X_tfidf_train))
print("Test...")
calc_metrics(y_test, y_pred)


Train ...
0.56535
0.561401174764401
0.6802321913054451,0.6738125,0.6770071273823354
0.5302017907256448,0.4959375,0.5124975779887618
0.5773993808049536,0.326375,0.4170260341798435
0.4350141907390848,0.6993125,0.5363725701685004
0.7004854368932039,0.6313125,0.6641025641025642
0.5846665980936664,0.56535,0.561401174764401
Test...
0.41215
0.4059272069124276
0.5442942942942943,0.54375,0.5440220110055027
0.3303080308030803,0.30025,0.3145625982189628
0.35642201834862386,0.19425,0.25145631067961166
0.32144997004194126,0.5365,0.4020232296740352
0.5535307517084282,0.486,0.5175718849840255
0.4212010130392736,0.4121499999999999,0.4059272069124276


# SVM


In [76]:
class_weight = {1: 0.90, 2: 1.9, 3: 1.75, 4: 1.2, 5: 0.65}

In [79]:
from sklearn.svm import LinearSVC

clf = LinearSVC(dual=False, C=0.1, max_iter=10000, class_weight='balanced', random_state=RANDOM_SEED)
clf.fit(X_tfidf_train, y_train)

y_pred = clf.predict(X_tfidf_test)

print("Train ...")
calc_metrics(y_train, clf.predict(X_tfidf_train))
print("Test...")
calc_metrics(y_test, y_pred)

Train ...
0.599125
0.5893862234329305
0.6171729851870449,0.7681875,0.6844493944034525
0.5469493028129282,0.424125,0.4777695638398986
0.5448294829482948,0.49525,0.5188580408590886
0.5727627031399367,0.4868125,0.5263015642420352
0.6726388533401587,0.82125,0.7395525538201774
0.5908706654856726,0.599125,0.5893862234329305
Test...
0.5256
0.5117807462805299
0.568359375,0.7275,0.6381578947368421
0.4157190635451505,0.31075,0.35565092989985697
0.44,0.396,0.41684210526315785
0.4800117577895356,0.40825,0.44123209943258584
0.6427986906710311,0.7855,0.7070207020702071
0.5093777774011434,0.5256000000000001,0.5117807462805299


# Logistic Regression


In [80]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(penalty='l2', solver="saga", max_iter=1000, multi_class="multinomial", C=0.25055, random_state=RANDOM_SEED)

clf.fit(X_tfidf_train, y_train)

y_pred = clf.predict(X_tfidf_test)

print("Train ...")
calc_metrics(y_train, clf.predict(X_tfidf_train))
print("Test...")
calc_metrics(y_test, y_pred)

Train ...
0.574475
0.569812049284734
0.6268805862432687,0.70575,0.6639814188692559
0.49181811851542373,0.4414375,0.4652679424261388
0.4925078110055474,0.48275,0.4875800902692296
0.5383646918934042,0.4810625,0.5081031125193913
0.6903547546186104,0.761375,0.7241276823396541
0.5679851924552508,0.5744750000000001,0.569812049284734
Test...
0.5266
0.5212038884712324
0.5941072999120492,0.6755,0.6321946654188114
0.41654939487756826,0.37,0.3918972593671389
0.4320548641097282,0.42525,0.4286254252236361
0.47793505412156534,0.4305,0.4529790872024201
0.6714842853865566,0.73175,0.700323005144156
0.5184261796814935,0.5266,0.5212038884712324


In [81]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(solver="lbfgs", max_iter=500, multi_class="multinomial")

clf.fit(X_tfidf_train, y_train)

y_pred = clf.predict(X_tfidf_test)

print("Train ...")
calc_metrics(y_train, clf.predict(X_tfidf_train))
print("Test...")
calc_metrics(y_test, y_pred)


Train ...
0.610725
0.6076220907036431
0.6648822886087608,0.7219375,0.6922362388757379
0.5387675296655879,0.4994375,0.5183575505967827
0.5331717274815004,0.522375,0.5277181462305847
0.5759972954699121,0.5324375,0.5533614810003248
0.7178140689018409,0.7774375,0.7464370368147859
0.6061265820255204,0.6107250000000001,0.6076220907036431
Test...
0.5238
0.5194748528355257
0.6025466120964075,0.6625,0.6311026434865444
0.4092133620689655,0.37975,0.39393153526970953
0.4234746639089969,0.4095,0.41637010676156583
0.47268793942671716,0.437,0.454143933489218
0.675531914893617,0.73025,0.701826045170591
0.5166908984789408,0.5237999999999999,0.5194748528355257


# Naive Bayes


In [82]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB(alpha=1.04)
clf.fit(X_tfidf_train, y_train)

y_pred = clf.predict(X_tfidf_test)

print("Train ...")
calc_metrics(y_train, clf.predict(X_tfidf_train))
print("Test...")
calc_metrics(y_test, y_pred)

Train ...
0.5878875
0.5874669978859853
0.6665835411471321,0.66825,0.6674157303370787
0.5202026599113363,0.513375,0.5167662787039948
0.5077169654517393,0.5345625,0.5207940084028496
0.5377798507462687,0.5044375,0.5205753353973168
0.7048909046334886,0.7188125,0.7117836365886866
0.587434784377993,0.5878875,0.5874669978859853
Test...
0.4985
0.4983753737512611
0.6061361935644799,0.6075,0.6068173305031839
0.3893225887685722,0.3865,0.38790615982938154
0.39511732638065894,0.41675,0.40564545565153914
0.435646186440678,0.41125,0.42309670781893005
0.6663354037267081,0.6705,0.668411214953271
0.4985115397762194,0.49850000000000005,0.4983753737512611


In [83]:
from sklearn.naive_bayes import MultinomialNB

clf = MultinomialNB()
clf.fit(X_tfidf_train, y_train)

y_pred = clf.predict(X_tfidf_test)

print("Train ...")
calc_metrics(y_train, clf.predict(X_tfidf_train))
print("Test...")
calc_metrics(y_test, y_pred)


Train ...
0.5887625
0.5882957233669669
0.6673314221612863,0.66925,0.6682893340822568
0.5219679198630571,0.5145625,0.518238756176628
0.5084635029993467,0.5350625,0.5214240034107867
0.5388874066168623,0.5049375,0.5213603510583377
0.7045009784735812,0.72,0.712166172106825
0.5882302460228267,0.5887625,0.5882957233669669
Test...
0.49855
0.4983208029153059
0.6056302939711011,0.60775,0.6066882954829049
0.3897021706208985,0.386,0.3878422506907812
0.39540393271736557,0.41725,0.4060333292786766
0.4353597026811787,0.41,0.4222994721256599
0.6657581764122894,0.67175,0.6687406669985068
0.4983708552805666,0.49855,0.4983208029153059
