## Import

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from function import Eddies_Tools
import numpy as np

from nltk import FreqDist
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

from sklearn.model_selection import GridSearchCV
from imblearn.under_sampling import RandomUnderSampler

## Setup

In [2]:
# loading in data
# A forward slash is used as the path separator so the relative path resolves
# on Windows, macOS and Linux alike (a backslash is only a separator on Windows).
# The first CSV column is used as the index.
trade_chat = pd.read_csv('data/trade_chat_v3.csv', index_col=0)

In [3]:
# Remove rows whose 'text' repeats an earlier row (the first occurrence is kept),
# then renumber the index from 0 so it is contiguous again.
trade_chat = (
    trade_chat
    .drop_duplicates(subset='text')
    .reset_index(drop=True)
)

In [4]:
# loading in stopwords and adding data related stop words.
s_words = stopwords.words('english')
s_words.extend(['u','ur','im','dont','thats'])

# instancing RegEx Tokenizer
# A token is either a run of lowercase letters or the literal "9.2"
# (presumably the game patch number - confirm), followed by any further
# word characters.
#   * The dot is escaped so only "9.2" matches, not "902", "9x2", etc.
#   * The group is non-capturing: NLTK documents that a RegexpTokenizer pattern
#     must not contain capturing parentheses, otherwise only the captured part
#     is returned and the trailing `\w*` is dropped from each token.
# NOTE(review): `[a-z]` only matches lowercase, so this assumes the text is
# lowercased before tokenizing - confirm in Eddies_Tools.nlp_tokenizer.
tokenizer = RegexpTokenizer(r"(?u)\b(?:[a-z]+|9\.2)\w*\b")

# instancing my tools
et = Eddies_Tools()

In [5]:
# tokenizing the trade_chat dataframe
# Passes the de-duplicated frame, the regex tokenizer and the stop-word list to
# the project helper and keeps the result as `tc_tokenized`.
# NOTE(review): `nlp_tokenizer` lives in the local `function` module and is not
# visible here; later cells read the 'joined_tokens', 'target' and 'sentiment'
# columns from copies of this result - confirm the helper provides them.
tc_tokenized = et.nlp_tokenizer(trade_chat, tokenizer, s_words)

## GridSearch

### Data Shaping

#### Simple Trade Chat Dataframe

In [15]:
# Work on an independent (deep) copy so relabelling the targets below
# leaves `tc_tokenized` untouched.
simple_tc = tc_tokenized.copy(deep=True)

In [16]:
# grouping multiple target labels into Game, Chat, and Service
# ('Chat' rows are left as they are).
# The result is assigned back to the column explicitly: calling
# `mask(..., inplace=True)` on `simple_tc.target` mutates an intermediate
# Series, which does not update the DataFrame under pandas Copy-on-Write.
service_labels = ['Boost', 'LFG', 'LFM', 'Trade']
game_labels = ['Game', 'Patch', 'Bug']

# Both conditions are evaluated against the original labels; the two label
# sets do not overlap, so the order of the two masks does not matter.
is_service = simple_tc['target'].isin(service_labels)
is_game = simple_tc['target'].isin(game_labels)

simple_tc['target'] = (
    simple_tc['target']
    .mask(is_service, 'Service')
    .mask(is_game, 'Game')
)

In [17]:
# Row count per grouped target label.
simple_tc['target'].value_counts()

Game       1333
Chat        940
Service     578
Name: target, dtype: int64

In [24]:
# Sentiment distribution within the rows labelled 'Game'.
game_rows = simple_tc['target'] == 'Game'
simple_tc.loc[game_rows, 'sentiment'].value_counts()

Other       929
Negative    404
Name: sentiment, dtype: int64

In [18]:
# Sanity check: regrouping the target labels must not add or drop rows, so the
# simple frame has to be as long as the tokenized frame it was copied from.
# (Comparing `simple_tc` with itself would always be True and check nothing.)
len(simple_tc) == len(tc_tokenized)

True

#### Under Sampled Trade Chat Dataframe

In [11]:
# Independent (deep) copy of the tokenized frame, used as the input for
# under-sampling.
under_tc = tc_tokenized.copy(deep=True)

In [12]:
# under sampling the tokenized dataframe by sentiment
# Rebinds `under_tc` to whatever the project helper returns for the
# 'sentiment' column.
# NOTE(review): `down_sample` is defined in the local `function` module and is
# not visible here; presumably it balances the classes (the value_counts output
# below shows 545 rows each). Whether it uses a fixed random seed cannot be
# seen from this notebook - confirm for reproducibility.
under_tc = et.down_sample(under_tc,'sentiment')

In [13]:
# Row count per sentiment class after under-sampling.
under_tc['sentiment'].value_counts()

Negative    545
Other       545
Name: sentiment, dtype: int64

#### Basic Trade Chat Dataframe

In [14]:
# Independent (deep) copy of the tokenized frame with the labels left as-is.
basic_tc = tc_tokenized.copy(deep=True)

### Data Split

#### Sentiment Split

##### Simple Split (stc)

In [None]:
# Features and labels for the simple (regrouped) frame.
X_stc = simple_tc['joined_tokens']

# Binary sentiment label: Negative -> 1, Other -> 0.
# An explicit `map` is used instead of `replace([...], [...])`, which relies on
# pandas silently downcasting the object result to integers (deprecated since
# pandas 2.2). Any label missing from the mapping becomes NaN.
y_stc_sent = simple_tc['sentiment'].map({'Negative': 1, 'Other': 0})

# Multi-class target label, taken unchanged from the frame.
y_stc_target = simple_tc['target']

In [None]:
# Hold out 25% of the simple frame for testing the sentiment label; the fixed
# random_state keeps the split repeatable across runs.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X_stc, y_stc_sent, test_size=0.25, random_state=5
)

##### Under Split (utc)

In [None]:
# Features and labels for the under-sampled frame.
X_utc = under_tc['joined_tokens']

# Binary sentiment label: Negative -> 1, Other -> 0.
# An explicit `map` is used instead of `replace([...], [...])`, which relies on
# pandas silently downcasting the object result to integers (deprecated since
# pandas 2.2). Any label missing from the mapping becomes NaN.
y_utc_sent = under_tc['sentiment'].map({'Negative': 1, 'Other': 0})

# Multi-class target label, taken unchanged from the frame.
y_utc_target = under_tc['target']

##### Basic Split (btc)

In [None]:
# Features and labels for the basic (unmodified) frame.
X_btc = basic_tc['joined_tokens']

# Binary sentiment label: Negative -> 1, Other -> 0.
# An explicit `map` is used instead of `replace([...], [...])`, which relies on
# pandas silently downcasting the object result to integers (deprecated since
# pandas 2.2). Any label missing from the mapping becomes NaN.
y_btc_sent = basic_tc['sentiment'].map({'Negative': 1, 'Other': 0})

# Multi-class target label, taken unchanged from the frame.
y_btc_target = basic_tc['target']