In [1]:
from collections import defaultdict
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import nltk
from nltk.tokenize import regexp_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix
from sklearn.metrics import plot_confusion_matrix
import re
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import roc_auc_score, plot_roc_curve, plot_precision_recall_curve
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
import xgboost as xgb

# Testing out different applications of nlp

**Notebook dedicated to find the best nlp model for the car reviews**
- Trying out lemmantizing
- Trying out stemming
- Used tfidvectorizer and a countingvectorizer
- See which model performed the best and keep it to use with the other model

In [2]:
data = pd.read_csv('CleanData3bins.csv')

In [3]:
data = data.drop(columns='Unnamed: 0')

In [4]:
data

Unnamed: 0,DriveTrain,Mileage,SalePrice,PriceCategory,CleanReviews,Year,newSize,newCyl,newMake,mpg
0,FWD,53200.0,29000.0,Average,Virtually nothing has gone wrong with my 2020 ...,2020.0,2.5,4,Toyota,27.0
1,RWD,22690.0,55975.0,Expensive,Stranded today. Could not get to work. Somethi...,2016.0,3.0,6,Volkswagen,18.0
2,FWD,17854.0,38900.0,Average,"Excellent road car, quiet, stable, comfortable...",2020.0,3.5,6,Other,18.0
3,FWD,60907.0,22125.0,Cheap,"PURCHASED FROM NYE TOYOTA, MY FIRST TACOMA. LO...",2021.0,2.5,4,Toyota,28.0
4,FWD,41614.0,23946.0,Cheap,I’ve own a 2020 Altima (under 30k miles) for a...,2020.0,2.4,4,Nissan,24.0
...,...,...,...,...,...,...,...,...,...,...
8469,FWD,78378.0,25500.0,Cheap,Only had the X5 for 4 weeks now but so far it ...,2022.0,3.6,6,BMW,18.0
8470,AWD,30909.0,35590.0,Average,When I bought my 2020 Tacoma V6 (3.5L) 4x4 in ...,2019.0,3.6,6,Toyota,18.0
8471,FWD,30186.0,24000.0,Cheap,Everything is great except the rear view camer...,2015.0,2.5,4,INFINITI,23.0
8472,FWD,22309.0,71077.0,Expensive,I have owned smaller SUV\s and Trucks for over...,2020.0,5.7,8,Toyota,15.0


**Review column cleaning**
- make a base model for nlp

In [5]:
df = data[['CleanReviews', 'PriceCategory']]

In [6]:
df.head()

Unnamed: 0,CleanReviews,PriceCategory
0,Virtually nothing has gone wrong with my 2020 ...,Average
1,Stranded today. Could not get to work. Somethi...,Expensive
2,"Excellent road car, quiet, stable, comfortable...",Average
3,"PURCHASED FROM NYE TOYOTA, MY FIRST TACOMA. LO...",Cheap
4,I’ve own a 2020 Altima (under 30k miles) for a...,Cheap


In [7]:
rev_index = 100

In [8]:
df['CleanReviews'][rev_index]

'Hands down the best Jeep ever.. would take it over the Santa Fe any day of the week.. there’s no comparison.. performance is excellent.. have 70,000 miles on a 2018 and a 2019.. ac went out on one but warranty covered it🙂🙂, I absolutely love my jeep it’s comfortable reliable and completely rugged I never feel like I’m going to get stranded stuck or be taken down by anything you feel completely safe high up and well compact in this vehicle definitely worth the money I will never change my vehicle make!!! Jeep owner for GOOD!!! , have a 2019 high altitude , love this vehicle so comfortable and smooth love it, first vehicle i have loved driving. love the design and the smooth ride'

**Testing out the patter, make sure it's taking out the unwanted characters**

In [9]:
re.sub("[a-zA-Z]+(?:'[a-z]+)?", '', df['CleanReviews'][rev_index])

'     ..            .. ’  ..   ..  70,000    2018   2019..         🙂🙂,      ’          ’                                  !!!    !!! ,   2019   ,         ,      .       '

# Pre-proccessing

- just lowercase all the reviews, the tfidvectorizer will take care of the rest

In [10]:
df['CleanReviews'] = df['CleanReviews'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['CleanReviews'] = df['CleanReviews'].str.lower()


In [11]:
df.head()

Unnamed: 0,CleanReviews,PriceCategory
0,virtually nothing has gone wrong with my 2020 ...,Average
1,stranded today. could not get to work. somethi...,Expensive
2,"excellent road car, quiet, stable, comfortable...",Average
3,"purchased from nye toyota, my first tacoma. lo...",Cheap
4,i’ve own a 2020 altima (under 30k miles) for a...,Cheap


- clean up reviews, 

In [12]:
#stopwords
sw = stopwords.words('english')

# Train, Test,  Split

- Creating a holdout set to use at the very end

In [13]:
X = data['CleanReviews']
y = data['PriceCategory']

In [14]:
X_t, X_hold, y_t, y_hold = train_test_split(X, y, test_size=0.10, random_state=42)

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X_t, y_t, test_size=0.25, random_state=42)

# Lemmatizing the text
- try lemmantizing
- try stemming


In [16]:
#https://github.com/flatiron-school/BSC-DS-2022/blob/main/Phase4/CompleteNotebooks/06-NaturalLanguage-Modeling_complete.ipynb
def get_wordnet_pos(treebank_tag):
    '''
    Translate nltk POS to wordnet tags
    '''
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [17]:
#https://github.com/flatiron-school/BSC-DS-2022/blob/main/Phase4/CompleteNotebooks/06-NaturalLanguage-Modeling_complete.ipynb
def doc_preparer(doc, stop_words=sw):
    '''
    
    :param doc: a document from the satire corpus 
    :return: a document string with words which have been 
            lemmatized, 
            parsed for stopwords, 
            made lowercase,
            and stripped of punctuation and numbers.
    '''
    
    regex_token = RegexpTokenizer(r"([a-zA-Z]+(?:’[a-z]+)?)")
    doc = regex_token.tokenize(doc)
    doc = [word.lower() for word in doc]
    doc = [word for word in doc if word not in sw]
#     print(doc)
    doc = pos_tag(doc)
    doc = [(word[0], get_wordnet_pos(word[1])) for word in doc]
    lemmatizer = WordNetLemmatizer() 
    doc = [lemmatizer.lemmatize(word[0], word[1]) for word in doc]
#     stemmer = nltk.stem.SnowballStemmer(language="english") #trying out stemmer instead of lemmatizer
#     doc = [stemmer.stem(word) for word in doc] #stemming words
    return ' '.join(doc)

In [18]:
token_docs = [doc_preparer(doc, sw) for doc in X_train]

In [19]:
X_train.iloc[rev_index]

'The ride is great, Toyota gives comfort and space, very good on gas and good price to own the car all you need is on this vehicle, This car rides nice and runs great it meets all of my driving needs. Would definitely buy another one. I needed a reliable car to get to work. Thank you, I went with the 2017 model due to the new 2018 + models having issues w/ that new 8 speed transmission. This was the last model w/  a bulletproof transmission so far. I hope to pay this car off and be payment free and problem free for the next decade. Always go with Toyota. They are the most reliable by statistics and no one wants to have car troubles.'

In [20]:
token_docs[rev_index]

'ride great toyota give comfort space good gas good price car need vehicle car ride nice run great meet drive need would definitely buy another one need reliable car get work thank go model due new model issue w new speed transmission last model w bulletproof transmission far hope pay car payment free problem free next decade always go toyota reliable statistic one want car trouble'

# Vectorize

In [32]:
vectorizer = TfidfVectorizer(
    token_pattern=r"([a-zA-Z]+(?:'[a-z]+)?)",
    stop_words=sw,
#     max_df=.95,  # removes words that appear in more than 95% of docs
#     min_df=2     # removes words that appear 2 or fewer times
)

#vectorizer = CountVectorizer()

In [33]:
vectorizer.fit(X_train)

X_train_vec = vectorizer.transform(X_train)
X_val_vec = vectorizer.transform(X_val)

In [34]:
classifier = MultinomialNB()
classifier.fit(X_train_vec, y_train)
classifier.score(X_train_vec, y_train)

0.4869732470711663

In [35]:
classifier.score(X_val_vec, y_val)

0.41321447299423175

In [37]:
print('lemmentizer with tfid vectorizer (hyper param of max_df and min_df) 0.41006816990036704')
print('lemmentizer with tfid vectorizer 0.41321447299423175') #best accuracy
print('stemmer with tfid vectorizer 0.41321447299423175')
print('stemming with count vectorizer 0.36392239119035136')

lemmentizer with tfid vectorizer (hyper param of max_df and min_df) 0.41006816990036704
lemmentizer with tfid vectorizer 0.41321447299423175
stemmer with tfid vectorizer 0.41321447299423175
stemming with count vectorizer 0.36392239119035136


## Conclusion

- stemming and lemmatizing text did not help
- probably because there is not enough variety in the reviews to make a difference
- just using a 'vanilla' tfid vectorizer was the best