In [1]:
# TODO: Remove this before submission
from IPython.core.interactiveshell import InteractiveShell

InteractiveShell.ast_node_interactivity = "all"


In [2]:
import pandas as pd
import numpy as np
import nltk

nltk.download("wordnet")
import re
from bs4 import BeautifulSoup


[nltk_data] Downloading package wordnet to /Users/aditya/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
! pip install bs4 # in case you don't have it installed

# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Jewelry_v1_00.tsv.gz



In [4]:
DATA_PATH = "./data"
DATA_FILE = "amazon_reviews_us_Jewelry_v1_00.tsv"

DATA_COL = "review_body"
TARGET_COL = "star_rating"

RANDOM_SEED = 42

## Read Data


## Keep Reviews and Ratings


In [5]:
# Load the tab separated data file, and print the first 5 rows for confirmation
data = pd.read_csv(f"{DATA_PATH}/amazon_reviews_us_Jewelry_v1_00.tsv", sep="\t", usecols=[TARGET_COL, DATA_COL])
data.head()


  data = pd.read_csv(f"{DATA_PATH}/amazon_reviews_us_Jewelry_v1_00.tsv", sep="\t", usecols=[TARGET_COL, DATA_COL])


Unnamed: 0,star_rating,review_body
0,5,so beautiful even tho clearly not high end ......
1,5,"Great product.. I got this set for my mother, ..."
2,5,Exactly as pictured and my daughter's friend l...
3,5,Love it. Fits great. Super comfortable and nea...
4,5,Got this as a Mother's Day gift for my Mom and...


Understanding Data


In [6]:
data.describe()


Unnamed: 0,star_rating,review_body
count,1767042,1766807
unique,11,1618522
top,5,Love it
freq,1041056,4288


In [7]:
data.star_rating.unique()


array([5, 1, 4, 3, 2, nan, '5', '1', '3', '4', '2', '2012-12-21'],
      dtype=object)

In [8]:
data.groupby([TARGET_COL]).count()


Unnamed: 0_level_0,review_body
star_rating,Unnamed: 1_level_1
1,150441
2,97259
3,153660
4,259019
5,1040896
1,4566
2,3541
2012-12-21,0
3,5999
4,11411


In [9]:
# Drop the outlier which is star_rating = "2012-12-21"
data = data[data.star_rating != "2012-12-21"]


In [10]:
# Remove nan valued rows
data = data[data.star_rating.notnull()]


In [11]:
data.describe()


Unnamed: 0,star_rating,review_body
count,1767041,1766807
unique,10,1618522
top,5,Love it
freq,1041056,4288


In [12]:
# Convert all star rating to integer
data[TARGET_COL] = data.star_rating.astype(int)


In [13]:
data.head()


Unnamed: 0,star_rating,review_body
0,5,so beautiful even tho clearly not high end ......
1,5,"Great product.. I got this set for my mother, ..."
2,5,Exactly as pictured and my daughter's friend l...
3,5,Love it. Fits great. Super comfortable and nea...
4,5,Got this as a Mother's Day gift for my Mom and...


In [14]:
data = data[data.review_body.notnull()]
data.head()


Unnamed: 0,star_rating,review_body
0,5,so beautiful even tho clearly not high end ......
1,5,"Great product.. I got this set for my mother, ..."
2,5,Exactly as pictured and my daughter's friend l...
3,5,Love it. Fits great. Super comfortable and nea...
4,5,Got this as a Mother's Day gift for my Mom and...


In [15]:
# There are no empty reviews
(data.review_body.str.len() <= 0).sum()


0

Now we can continue with the process.


## We select 20000 reviews randomly from each rating class.


In [16]:
# np.random.seed(101)
N_SAMPLES = 25000
N_SAMPLES_ACTUAL = 20000


In [17]:
sampled_data = data.groupby(TARGET_COL, group_keys=False).apply(lambda x: x.sample(N_SAMPLES, random_state=RANDOM_SEED))
sampled_data.reset_index(inplace=True)
sampled_data.drop(columns=["index"], inplace=True)


In [18]:
sampled_data.head()

Unnamed: 0,star_rating,review_body
0,1,Too small even for the knuckles.
1,1,Did not fit right
2,1,This stupid kit has 16 gauge needles not 14gauge.
3,1,I would not suggest this item I bought the one...
4,1,I am sure that it will be lovely once I get it...


# Data Cleaning


In [19]:
avg_len_before_cleaning = sampled_data.review_body.str.len().mean()
f"Avg. length of reviews BEFORE CLEANING :: {avg_len_before_cleaning}"


'Avg. length of reviews BEFORE CLEANING :: 189.880568'

In [20]:
# Convert all reviews to lower case (optional according to study)
def to_lower(data: pd.Series):
    return data.str.lower()


In [21]:
def remove_accented_characters(data: pd.Series):
    import unicodedata

    """Removes accented characters from the Series

    Args:
        data (pd.Series): Series of string

    Returns:
        _type_: pd.Series
    """
    import unicodedata

    return data.apply(lambda x: unicodedata.normalize("NFKD", x).encode("ascii", "ignore").decode("utf-8", "ignore"))


In [22]:
def remove_html_and_url(data: pd.Series):
    """Function to remove
             1. HTML encodings
             2. HTML tags (both closed and open)
             3. URLs

    Args:
        data (pd.Series): A Pandas series of type string

    Returns:
        _type_: pd.Series
    """
    # Remove HTML encodings
    data.str.replace(r"&#\d+;", " ", regex=True)

    # Remove HTML tags (both open and closed)
    data.str.replace(r"<[a-zA-Z]+\s?/?>", " ", regex=True)

    # Remove URLs
    data.str.replace(r"https?://([\w\-\._]+){2,}/[\w\-\.\-/=\+_\?]+", " ", regex=True)

    return data


In [23]:
! pip install emot



In [24]:
# Handle emoji
def convert_emoji_to_txt(data: pd.Series):
  from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO

  EMO_TO_TXT_DICT = dict()
  for emot in UNICODE_EMOJI:
    EMO_TO_TXT_DICT[emot] = f" {re.sub(r',|:|_', '', UNICODE_EMOJI[emot])} "

  for emo in EMOTICONS_EMO:
    EMO_TO_TXT_DICT[emot] = f" {re.sub(r',| ', '', EMOTICONS_EMO[emo])} "

  def convert_emojis(text, emo_to_txt_dict):
    for emot in emo_to_txt_dict:
        text = text.replace(emot, emo_to_txt_dict[emot])
    return text

  return data.apply(lambda x: convert_emojis(x, EMO_TO_TXT_DICT))

In [25]:
# Remove non-alphabetical characters
def remove_non_alpha_characters(data: pd.Series):
    return data.str.replace(r"_+|\\|[^a-zA-Z0-9\s]", " ", regex=True)


In [26]:
# Remove extra spaces
def remove_extra_spaces(data: pd.Series):
    return data.str.replace(r"^\s*|\s\s*", " ", regex=True)


In [27]:
# Install contractions package, if you don't have it
! pip install contractions



In [28]:
# Expanding contractions
def fix_contractions(data: pd.Series):
    import contractions

    def contraction_fixer(txt: str):
        return " ".join([contractions.fix(word) for word in txt.split()])

    return data.apply(contraction_fixer)


In [29]:
# A dictionary containing the columns and a list of functions to perform on it in order
data_cleaning_pipeline = {
    DATA_COL: [
        convert_emoji_to_txt,
        to_lower,
        remove_accented_characters,
        remove_html_and_url,
        fix_contractions,
        remove_non_alpha_characters,
        remove_extra_spaces,
    ]
}

cleaned_data = sampled_data.copy()

# Process all the cleaning instructions
for col, pipeline in data_cleaning_pipeline.items():
    # Get the column to perform cleaning on
    temp_data = cleaned_data[col].copy()

    # Perform all the cleaning functions sequencially
    for func in pipeline:
        print(f"Starting: {func.__name__}")
        temp_data = func(temp_data)
        print(f"Ended: {func.__name__}")

    # Replace the old column with cleaned one.
    cleaned_data[col] = temp_data.copy()


Starting: convert_emoji_to_txt
Ended: convert_emoji_to_txt
Starting: to_lower
Ended: to_lower
Starting: remove_accented_characters
Ended: remove_accented_characters
Starting: remove_html_and_url
Ended: remove_html_and_url
Starting: fix_contractions
Ended: fix_contractions
Starting: remove_non_alpha_characters
Ended: remove_non_alpha_characters
Starting: remove_extra_spaces
Ended: remove_extra_spaces


In [30]:
avg_len_after_cleaning = cleaned_data.review_body.str.len().mean()
f"Avg. length of reviews after cleaning :: {avg_len_after_cleaning}"


'Avg. length of reviews after cleaning :: 186.475896'

In [31]:
f"Before Cleaning: {avg_len_before_cleaning} ;; After Cleaning: {avg_len_after_cleaning}"


'Before Cleaning: 189.880568 ;; After Cleaning: 186.475896'

In [32]:
# TODO: Remove Test Block
# Checkpoint: Cleaned Data
cleaned_data.to_csv(f"{DATA_PATH}/cleaned.tsv", sep="\t", index=False, encoding="UTF-8")


# Pre-processing


## remove the stop words


In [33]:
# TODO: Remove Test Block
# cleaned_data = pd.read_csv(f"{DATA_PATH}/cleaned.tsv", sep="\t")


In [34]:
avg_len_before_preprocessing = cleaned_data[DATA_COL].str.len().mean()
print(f"Avg. length of the reviews before preprocessing :: {avg_len_before_preprocessing}")


Avg. length of the reviews before preprocessing :: 186.475896


In [35]:
def tokenize(data: pd.Series):
    from nltk.tokenize import word_tokenize

    nltk.download("punkt")

    return data.apply(word_tokenize)


In [36]:
from typing import List, Set


def remove_stopwords(data: pd.Series):
    """Remove stop words using the NLTK stopwords dictionary

    Args:
        string (str): a document

    Returns:
        str: a document with stopwords removed
    """
    from nltk.corpus import stopwords

    nltk.download("stopwords")

    stopwords = set(stopwords.words())

    def remover(word_list: List[str], stopwords: Set[str]):
        return [word for word in word_list if not word in stopwords]

    return data.apply(lambda word_list: remover(word_list, stopwords))


## perform lemmatization


In [37]:
def lemmatize(data: pd.Series, consider_pos_tag: bool = True):
    from nltk.corpus import wordnet
    from nltk.stem import WordNetLemmatizer

    nltk.download("omw-1.4")

    # POS tagging
    def perform_nltk_pos_tag(data: pd.Series):
        from nltk import pos_tag

        nltk.download("averaged_perceptron_tagger")

        return data.apply(pos_tag)

    # Convert POS tag to wordnet pos tags
    def wordnet_pos_tagger(tag: str):
        if tag.startswith("J"):
            return wordnet.ADJ
        elif tag.startswith("V"):
            return wordnet.VERB
        elif tag.startswith("N"):
            return wordnet.NOUN
        elif tag.startswith("R"):
            return wordnet.ADV
        else:
            return None

    lemmatizer = WordNetLemmatizer()
    lemmatized = list()

    if consider_pos_tag:
        pos_tagged_data = data.copy()
        pos_tagged_data = perform_nltk_pos_tag(data)

        for row in pos_tagged_data:

            lemmatized_row = list()

            if consider_pos_tag:
                for word, tag in row:
                    wordnet_pos_tag = wordnet_pos_tagger(tag)

                    if wordnet_pos_tag is None:
                        lemmatized_row.append(word)
                    else:
                        result = lemmatizer.lemmatize(word, wordnet_pos_tag)
                        lemmatized_row.append(lemmatizer.lemmatize(word, wordnet_pos_tag))

            lemmatized.append(lemmatized_row)
    else:
        for row in data:
            lemmatized_row = list()

            for word in row:
                lemmatized_row.append(lemmatizer.lemmatize(word))

            lemmatized.append(lemmatized_row)

    return pd.Series(lemmatized)


In [38]:
# Concatenate lemmatized sentences back into one sentence
def concatenate(data: pd.Series):
    return data.apply(lambda words: " ".join(words))


In [39]:
preprocessing_pipeline = {DATA_COL: [tokenize, lemmatize, concatenate]}

# Run the pipeline
preprocessed_data = cleaned_data.copy()

# Process all the cleaning instructions
for col, pipeline in preprocessing_pipeline.items():
    # Get the column to perform cleaning on
    temp_data = preprocessed_data[col]

    # Perform all the cleaning functions sequencially
    for func in pipeline:
        print(f"Starting: {func.__name__}")

        if func.__name__ == "lemmatize":
            temp_data = func(temp_data, consider_pos_tag=True)
        else:
            temp_data = func(temp_data)

        print(f"Ended: {func.__name__}")

    # Replace the old column with cleaned one.
    preprocessed_data[col] = temp_data.copy()


Starting: tokenize


[nltk_data] Downloading package punkt to /Users/aditya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Ended: tokenize
Starting: lemmatize


[nltk_data] Downloading package omw-1.4 to /Users/aditya/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/aditya/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Ended: lemmatize
Starting: concatenate
Ended: concatenate


In [40]:
preprocessed_data.head()

Unnamed: 0,star_rating,review_body
0,1,too small even for the knuckle
1,1,do not fit right
2,1,this stupid kit have 16 gauge needle not 14gauge
3,1,i would not suggest this item i buy the one wi...
4,1,i be sure that it will be lovely once i get it...


In [41]:
# TODO: Remove Test Block
# CHECKPOINT
# Save lemmatized data
import pickle as pkl

with open(f"{DATA_PATH}/preprocessed.pkl", "wb") as file:
    pkl.dump(preprocessed_data, file)


In [42]:
avg_len_after_preprocessing = preprocessed_data[DATA_COL].str.len().mean()
print(f"Avg. length of the reviews after preprocessing :: {avg_len_after_preprocessing}")


Avg. length of the reviews after preprocessing :: 177.973472


In [43]:
f"Before Preprocessing: {avg_len_before_preprocessing} ;; After Preprocessing: {avg_len_after_preprocessing}"


'Before Preprocessing: 186.475896 ;; After Preprocessing: 177.973472'

# TF-IDF Feature Extraction


In [44]:
# TODO: Remove Test Block
# Load lemmatized data
# import pickle as pkl

# preprocessed_data = None
# with open(f"{DATA_PATH}/preprocessed.pkl", "rb") as file:
#     lemmatized_data = pkl.load(file)


In [45]:
preprocessed_data[preprocessed_data[DATA_COL].str.len() == 0].groupby(TARGET_COL).count()


Unnamed: 0_level_0,review_body
star_rating,Unnamed: 1_level_1
1,6
2,2
3,2
4,2
5,10


In [46]:
preprocessed_data[preprocessed_data[DATA_COL].isnull()]


Unnamed: 0,star_rating,review_body


In [47]:
preprocessed_data[DATA_COL].isnull().values.any(), preprocessed_data[DATA_COL].isnull().sum()


(False, 0)

In [48]:
# Drop empty strings
preprocessed_data = preprocessed_data[preprocessed_data[DATA_COL].str.len() != 0]
# Drop NA reviews
preprocessed_data.dropna(subset=[DATA_COL], inplace=True)


In [49]:
preprocessed_data[preprocessed_data[DATA_COL].str.len() == 0].groupby(TARGET_COL).count()


Unnamed: 0_level_0,review_body
star_rating,Unnamed: 1_level_1


In [50]:
# TODO: Remove this block
# Retrieve checkpoint
# preprocessed_data = pd.read_csv(f"{DATA_PATH}/data.tsv", sep="\t")


In [51]:
preprocessed_data.groupby(['star_rating']).count()

Unnamed: 0_level_0,review_body
star_rating,Unnamed: 1_level_1
1,24994
2,24998
3,24998
4,24998
5,24990


In [52]:
# Resample data.
data = preprocessed_data.groupby(TARGET_COL, group_keys=False).apply(lambda x: x.sample(N_SAMPLES_ACTUAL, random_state=RANDOM_SEED))
data.reset_index(inplace=True)
data.drop(columns=["index"], inplace=True)


In [53]:
data.groupby(['star_rating']).count()

Unnamed: 0_level_0,review_body
star_rating,Unnamed: 1_level_1
1,20000
2,20000
3,20000
4,20000
5,20000


In [54]:
# TODO: Remove Checkpoint ...
data.to_csv(f"{DATA_PATH}/data.tsv", sep="\t", index=False)


In [55]:
# Split the data 80-20 split
from sklearn.model_selection import train_test_split

train, test = train_test_split(data, test_size=0.2, stratify=data[TARGET_COL], random_state=RANDOM_SEED)


In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize

nltk.download("punkt")

vectorizer = TfidfVectorizer(tokenizer=word_tokenize)
vectorizer.fit(train[DATA_COL])

X_tfidf_train = vectorizer.transform(train[DATA_COL])
X_tfidf_test = vectorizer.transform(test[DATA_COL])
y_train = train[TARGET_COL]
y_test = test[TARGET_COL]


[nltk_data] Downloading package punkt to /Users/aditya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True



### Helper Functions


In [57]:
def calc_metrics(y_true, y_pred):
    from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average=None)
    recall = recall_score(y_true, y_pred, average=None)
    f1 = f1_score(y_true, y_pred, average=None)

    print(f"{accuracy}")
    print(f"{np.mean(f1)}")

    for rating_precision, rating_recall, rating_f1 in zip(precision, recall, f1):
        print(f"{rating_precision},{rating_recall},{rating_f1}")

    print(f"{np.mean(precision)},{np.mean(recall)},{np.mean(f1)}")


# Perceptron


In [165]:
from sklearn.linear_model import Perceptron

# clf = Perceptron(max_iter=1000, alpha=0.5, random_state=RANDOM_SEED) # 0.43115027900835684
# clf = Perceptron(max_iter=1000, alpha=0.5, random_state=RANDOM_SEED, tol=1e-5) # 0.4336870195755223
# clf = Perceptron(max_iter=4000, alpha=0.5, random_state=RANDOM_SEED, tol=1e-5, early_stopping=True) # 0.4343847796358854
# clf = Perceptron(max_iter=8000, alpha=0.01, random_state=RANDOM_SEED, tol=1e-5, early_stopping=True, class_weight="balanced") # 0.4343847796358854

class_weight = {1: 0.29525, 2: 16.45725, 3: 8.6525, 4: 1, 5: 0.8585} # 0.45103975891511683
clf = Perceptron(max_iter=8000, alpha=0.01, random_state=RANDOM_SEED, tol=1e-5, early_stopping=True, class_weight=class_weight) # 0.45103975891511683

clf.fit(X_tfidf_train, y_train)

y_pred = clf.predict(X_tfidf_test)

print("Train ...")
calc_metrics(y_train, clf.predict(X_tfidf_train))
print("Test...")
calc_metrics(y_test, y_pred)

Train ...
0.4431125
0.4294833112028913
0.8398026315789474,0.1595625,0.268172268907563
0.3341968205491779,0.766,0.46536176029464815
0.370390726491456,0.4633125,0.4116732381851502
0.5343833185448092,0.301125,0.38519347617524785
0.7470018655059074,0.5255625,0.6170158124518473
0.5651550725340596,0.44311249999999996,0.4294833112028913
Test...
0.35425
0.3388449426900742
0.6776034236804565,0.11875,0.20208466283769408
0.28262040728369786,0.65575,0.39500037647767483
0.27497062279670975,0.351,0.30836810893916106
0.3673843334860284,0.2005,0.2594209930454472
0.6526200073286919,0.44525,0.5293505721503938
0.45103975891511683,0.35425,0.3388449426900742


In [134]:
from sklearn.linear_model import Perceptron

clf = Perceptron(max_iter=2000, alpha=0.5, random_state=RANDOM_SEED) # 0.43115027900835684

clf.fit(X_tfidf_train, y_train)

y_pred = clf.predict(X_tfidf_test)

print("Train ...")
calc_metrics(y_train, clf.predict(X_tfidf_train))
print("Test...")
calc_metrics(y_test, y_pred)

Train ...
0.5876375
0.5860666626877278
0.6852594882588963,0.651125,0.6677563054834471
0.5656843371423662,0.4115,0.47642823546438
0.4686575708424769,0.614,0.5315729668307992
0.5318808074335148,0.51875,0.525233349153615
0.7163522391658128,0.7428125,0.7293424565063974
0.5935668885686134,0.5876375,0.5860666626877278
Test...
0.42985
0.42713987642525686
0.538232104121475,0.49625,0.5163891779396462
0.3286467486818981,0.23375,0.2731921110299489
0.332143528085666,0.442,0.3792770567413923
0.3561473369835739,0.35775,0.35694686954352706
0.6005816771691711,0.6195,0.6098941668717697
0.43115027900835684,0.42984999999999995,0.42713987642525686


# SVM


In [62]:
from sklearn.svm import LinearSVC

# class_weight = {1: 0.95, 2: 1.7, 3: 1.65, 4: 1, 5: 0.6525} # 0.5250080879095086
# class_weight = {1: 0.95, 2: 1.65, 3: 1.85, 4: 1, 5: 0.6525} # 0.5265625011400694
# class_weight = {1: 0.95, 2: 1.725, 3: 2.85, 4: 1, 5: 0.6525} # 0.5283863569082248
# class_weight = {1: 0.95, 2: 1.725, 3: 2.85, 4: 1, 5: 0.6525} # 0.5298186572543087
# class_weight = {1: 0.95, 2: 1.725, 3: 2.925, 4: 1, 5: 0.8525} # 0.5298707666039373
# class_weight = {1: 0.95, 2: 1.725, 3: 2.925, 4: 1, 5: 0.9585} # 0.5305190760270104
# class_weight = {1: 0.95, 2: 1.8525, 3: 2.925, 4: 1, 5: 0.9585} # 0.5318580010845554
# class_weight = {1: 0.95, 2: 1.8525, 3: 3.3525, 4: 1, 5: 0.9585} # 0.5328127772891916
# class_weight = {1: 0.95, 2: 1.9725, 3: 3.3525, 4: 1, 5: 0.9585} # 0.534678946516937
# class_weight = {1: 0.95, 2: 2.3725, 3: 3.3525, 4: 1, 5: 0.9585} # 0.5378650442928313
# class_weight = {1: 0.95, 2: 2.85725, 3: 3.5525, 4: 1, 5: 0.9585} # 0.5403434839237587
# class_weight = {1: 0.95, 2: 4.85725, 3: 3.8525, 4: 1, 5: 0.9585} # 0.5474152267990974
# class_weight = {1: 0.7625, 2: 4.85725, 3: 3.8525, 4: 1, 5: 0.9585} # 0.551693399374575
# class_weight = {1: 0.7625, 2: 5.25725, 3: 3.9525, 4: 1, 5: 0.9585} # 0.5519983833002682
# class_weight = {1: 0.7625, 2: 5.95725, 3: 3.9525, 4: 1, 5: 0.9585} # 0.5530668437190186
# class_weight = {1: 0.5625, 2: 5.95725, 3: 4.2525, 4: 1, 5: 0.9585} # 0.5634729752537446
# class_weight = {1: 0.5625, 2: 6.95725, 3: 4.6525, 4: 1, 5: 0.8585} # 0.5656444257835423
# class_weight = {1: 0.2625, 2: 6.95725, 3: 4.6525, 4: 1, 5: 0.8585} # 0.5732672392507041
class_weight = {1: 0.29525, 2: 7.45725, 3: 4.6525, 4: 1, 5: 0.8585} # 0.5746383318772281
# class_weight = {1: 0.89525, 2: 9.85725, 3: 6.7525, 4: 1, 5: 0.8585} # 

clf = LinearSVC(dual=False, C=0.1, max_iter=200, class_weight=class_weight, random_state=RANDOM_SEED) # 0.575300476768289
# clf = LinearSVC(dual=False, C=0.555, max_iter=5, class_weight=class_weight, random_state=RANDOM_SEED)

clf.fit(X_tfidf_train, y_train)

y_pred = clf.predict(X_tfidf_test)

print("Train ...")
calc_metrics(y_train, clf.predict(X_tfidf_train))
print("Test...")
calc_metrics(y_test, y_pred)

Train ...
0.476075
0.4241741907355479
0.9487895716945997,0.0636875,0.1193627738081293
0.36066219944816713,0.8578125,0.5078161132180188
0.4344088924890869,0.6033125,0.5051149891420947
0.6810945273631841,0.171125,0.2735264735264735
0.7485304169514696,0.6844375,0.7150506039830232
0.6346971215893015,0.47607499999999997,0.4241741907355479
Test...
0.418
0.3723706346339909
0.918918918918919,0.0595,0.11176332472411364
0.3166392431098313,0.76975,0.44870300204022145
0.35155822854018587,0.48225,0.4066617476546853
0.567741935483871,0.132,0.21419878296146044
0.7183333333333334,0.6465,0.6805263157894736
0.5746383318772281,0.41800000000000004,0.3723706346339909


# Logistic Regression


In [89]:
from sklearn.linear_model import LogisticRegression

class_weight = {1: 1, 2: 1, 3: 1, 4: 1, 5: 1.1024} # 0.5172256481829293

clf = LogisticRegression(penalty='l2', solver="saga", max_iter=100, multi_class="multinomial", C=0.25055, random_state=RANDOM_SEED, class_weight=class_weight)

clf.fit(X_tfidf_train, y_train)

y_pred = clf.predict(X_tfidf_test)

print("Train ...")
calc_metrics(y_train, clf.predict(X_tfidf_train))
print("Test...")
calc_metrics(y_test, y_pred)

Train ...
0.5774125
0.5712491349841665
0.6285777802478741,0.706875,0.6654311181713882
0.4975791172549295,0.4431875,0.46881094839840004
0.4979255760515734,0.4875625,0.49268955063630915
0.551932547380988,0.4623125,0.5031630501326441
0.6739444533633007,0.787125,0.7261510075820913
0.5699918948597331,0.5774125,0.5712491349841665
Test...
0.5264
0.5197525668214037
0.5867528991971455,0.65775,0.6202263083451202
0.4094143404488232,0.374,0.3909067154429057
0.43644284982060483,0.42575,0.43103011895722604
0.49355708720407554,0.41175,0.4489573395120622
0.6599610642439974,0.76275,0.7076423518497044
0.5172256481829293,0.5264,0.5197525668214037


# Naive Bayes


In [132]:
from sklearn.naive_bayes import MultinomialNB

# clf = MultinomialNB(alpha=1.064) # 0.4952600291865206
# clf = MultinomialNB(alpha=2.5) # 0.5024158722296946
# clf = MultinomialNB(alpha=3.9086) # 0.5065314917640114
# clf = MultinomialNB(alpha=32.7886) # 0.5124296411150511
clf = MultinomialNB(alpha=34.89886) # 0.5126996219647234
# clf = MultinomialNB(alpha=34.89986)

clf.fit(X_tfidf_train, y_train)

y_pred = clf.predict(X_tfidf_test)

print("Train ...")
calc_metrics(y_train, clf.predict(X_tfidf_train))
print("Test...")
calc_metrics(y_test, y_pred)

Train ...
0.5163
0.52292294526062
0.6661536093581653,0.541,0.5970890529074981
0.42018348623853213,0.5009375,0.4570206699928725
0.43624426962476653,0.48175,0.4578692565861772
0.44507336374127604,0.5061875,0.47366728075562187
0.7315375051802735,0.551625,0.62896846606093
0.5398384468286027,0.5163,0.52292294526062
Test...
0.4859
0.49344447326009827
0.6413421968977524,0.5065,0.5660008381058806
0.3771255060728745,0.46575,0.41677852348993283
0.40835425701894545,0.44725,0.4269180288748359
0.42214912280701755,0.48125,0.4497663551401869
0.714527027027027,0.52875,0.6077586206896551
0.5126996219647234,0.4859,0.49344447326009827
