# Machine Learning Embedded System
## Final Project -> Training notebook
## Author: Glauber Rodrigues Leite

In [1]:
import pandas as pd
import numpy as np
import nltk
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from service.preprocess import PreProcess

## Developing the recommendation classifier

### Reading and treating data

Reading CSV

In [2]:
# Load the raw Steam reviews; the path is relative to the notebook's working directory.
data = pd.read_csv(filepath_or_buffer='steam_reviews.csv')


Assigning a number to recommendation field

In [3]:
# Encode the textual recommendation as a binary label:
#   0 -> the text contains "Not Recommended", 1 -> anything else.
# The column is replaced in a single assignment. Writing through
# data['recommendation'][idx] is chained indexing: pandas emits
# SettingWithCopyWarning for it and, with Copy-on-Write enabled, the write
# never reaches `data` at all.
# NOTE(review): na=False maps a missing recommendation to label 1, whereas the
# previous loop would have raised on a non-string value -- confirm the column
# has no gaps.
is_not_recommended = data['recommendation'].str.contains(
    'Not Recommended', regex=False, na=False
)
data['recommendation'] = (~is_not_recommended).astype(int)

In [4]:
# Show how many reviews fall under each recommendation label.
data.recommendation.value_counts()

1    6832
0    3740
Name: recommendation, dtype: int64

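We have more "recommended" instances than "not recommended" instances, so we need to balance the classes by randomly downsampling the "recommended" reviews to the number of "not recommended" reviews.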

In [5]:
# Balance the two classes by downsampling the "recommended" rows to the number of
# "not recommended" rows. A fixed random_state makes the sampled subset -- and
# therefore every artefact trained on it -- reproducible across kernel restarts.
not_recommended = data[data['recommendation'] == 0]
recommended = data[data['recommendation'] == 1]
recommended = recommended.sample(n=len(not_recommended), random_state=42)
data = pd.concat([recommended, not_recommended])

In [6]:
# Re-check the label counts after downsampling.
data.recommendation.value_counts()

1    3740
0    3740
Name: recommendation, dtype: int64

Cleaning the review documents

In [7]:
# Cast the review column to str before handing the frame to PreProcess.
data['review'] = data['review'].astype(str)
pre_processor = PreProcess(data, 'review')

# Cleaning stages, applied strictly in this order; the value returned by each
# stage is re-bound to `data`.
cleaning_steps = (
    'clean_html',
    'remove_non_ascii',
    'remove_spaces',
    'remove_punctuation',
    'stemming',
    'lemmatization',
    'stop_words',
)
for step_name in cleaning_steps:
    data = getattr(pre_processor, step_name)()

### Fitting TF-IDF vectorizer

In [41]:
# Hold out 20% of the reviews as a test partition. random_state pins the shuffle
# so the same rows land in train/test on every run, which keeps the saved
# vectorizer and classifier reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    data.review,
    data.recommendation,
    test_size=0.20,
    random_state=42,
)

In [42]:
# TF-IDF vectorizer for the recommendation model; min_df=1 keeps every term that
# appears in at least one document.
tfidf_transformer = TfidfVectorizer(min_df=1)

In [43]:
# Fit the vectorizer on the training reviews only and vectorize them in the same pass.
train_vectors = tfidf_transformer.fit_transform(raw_documents=train_x)

In [44]:
# Persist the fitted vectorizer under service/ so it can be reloaded outside this notebook.
joblib.dump(value=tfidf_transformer, filename='service/tfidf_vectorizer.joblib')

['service/tfidf_vectorizer.joblib']

### Classification for recommendation

In [57]:
# Linear-kernel SVM with a small C; probability=True is requested so the fitted
# model can report class probabilities.
recommendation_clf = SVC(
    C=0.025,
    kernel="linear",
    probability=True,
)
# Labels are cast to int for fitting; the fitted estimator is the cell's displayed value.
recommendation_clf.fit(train_vectors, train_y.astype(int))

SVC(C=0.025, kernel='linear', probability=True)

In [58]:
# Persist the trained recommendation classifier next to its vectorizer.
joblib.dump(value=recommendation_clf, filename='service/recommendation_clf.joblib')

['service/recommendation_clf.joblib']

## Building a classifier for helpful category

In [4]:
# Reload the raw dataset: `data` currently holds the balanced, cleaned subset
# built for the recommendation model.
data = pd.read_csv(filepath_or_buffer='steam_reviews.csv')

In [5]:
# Binarize the helpful count: 1 when the value is greater than zero, otherwise 0.
data['helpful'] = (data['helpful'] > 0).astype('int64')

In [6]:
# Show how many reviews fall under each helpful label.
data.helpful.value_counts()

0    9226
1    1346
Name: helpful, dtype: int64

In [7]:
# Balance the two classes by downsampling the "not helpful" rows to the number of
# "helpful" rows. A fixed random_state makes the sampled subset -- and therefore
# every artefact trained on it -- reproducible across kernel restarts.
helpful = data[data['helpful'] == 1]
not_helpful = data[data['helpful'] == 0]
not_helpful = not_helpful.sample(n=len(helpful), random_state=42)
data = pd.concat([helpful, not_helpful])
# Displayed as the cell output to confirm both classes now have the same size.
data['helpful'].value_counts()

1    1346
0    1346
Name: helpful, dtype: int64

In [8]:
# Cast the review column to str before handing the frame to PreProcess.
data['review'] = data['review'].astype(str)
pre_processor = PreProcess(data, 'review')

# Cleaning stages, applied strictly in this order; the value returned by each
# stage is re-bound to `data`.
cleaning_steps = (
    'clean_html',
    'remove_non_ascii',
    'remove_spaces',
    'remove_punctuation',
    'stemming',
    'lemmatization',
    'stop_words',
)
for step_name in cleaning_steps:
    data = getattr(pre_processor, step_name)()

In [9]:
# Hold out 20% of the reviews as a test partition. random_state pins the shuffle
# so the same rows land in train/test on every run, which keeps the saved
# vectorizer and classifier reproducible.
train_x, test_x, train_y, test_y = train_test_split(
    data.review,
    data.helpful,
    test_size=0.20,
    random_state=42,
)

In [10]:
# Fresh TF-IDF vectorizer for the helpful model (re-binds the name used for the
# recommendation vectorizer); min_df=1 keeps every term that appears in at least
# one document.
tfidf_transformer = TfidfVectorizer(min_df=1)

In [11]:
# Fit the vectorizer on the training reviews only and vectorize them in the same pass.
train_vectors = tfidf_transformer.fit_transform(raw_documents=train_x)

In [12]:
# Persist the helpful-model vectorizer under its own file name in service/.
joblib.dump(value=tfidf_transformer, filename='service/tfidf_vectorizer_helpful.joblib')

['service/tfidf_vectorizer_helpful.joblib']

In [13]:
# k-nearest-neighbours classifier with k = 3 for the helpful label.
helpful_clf = KNeighborsClassifier(n_neighbors=3)
# Labels are cast to int for fitting; the fitted estimator is the cell's displayed value.
helpful_clf.fit(train_vectors, train_y.astype(int))

KNeighborsClassifier(n_neighbors=3)

In [14]:
# Persist the trained helpful classifier next to its vectorizer.
joblib.dump(value=helpful_clf, filename='service/helpful_clf.joblib')

['service/helpful_clf.joblib']