In [53]:
import sys  
!{sys.executable} -m pip install contractions
!{sys.executable} -m pip install unidecode
!{sys.executable} -m pip install emoji

Collecting emoji
  Downloading emoji-1.2.0-py3-none-any.whl (131 kB)
Installing collected packages: emoji
Successfully installed emoji-1.2.0


In [2]:
import pandas as pd
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import numpy as np
%matplotlib inline

init_notebook_mode(connected=True)
import cufflinks as cf
cf.go_offline()

# Importing the dataset
dataset = pd.read_csv('train.csv', delimiter = ',',usecols=["text", "offense_rating"])

print(dataset)

                                                   text  offense_rating
0     TENNESSEE: We're the best state. Nobody even c...            0.20
1     A man inserted an advertisement in the classif...            1.10
2     How many men does it take to open a can of bee...            2.40
3     Told my mom I hit 1200 Twitter followers. She ...            0.00
4     Roses are dead. Love is fake. Weddings are bas...            0.10
...                                                 ...             ...
7995  Lack of awareness of the pervasiveness of raci...            0.25
7996    Why are aspirins white? Because they work sorry            3.85
7997  Today, we Americans celebrate our independence...            0.00
7998  How to keep the flies off the bride at an Ital...            3.00
7999  "Each ounce of sunflower seeds gives you 37% o...            0.00

[8000 rows x 2 columns]


In [3]:
print(len(dataset[dataset['offense_rating'] == '']))

0


There are no cells without an offense_rating value

In [4]:
dataset['offense_rating'].iplot(
    kind='hist',
    bins=50,
    xTitle='Offense Rating',
    linecolor='black',
    yTitle='count',
    title='Offense Rating Distribution')

### The offense rating mean:

In [5]:
mean = dataset['offense_rating'].mean()
print("Mean value for offense rating:", mean)

Mean value for offense rating: 0.5853250000000031


### The offense rating median:

In [6]:
median = dataset['offense_rating'].median()
print("Median value for offense rating:", median)

Median value for offense rating: 0.1


### The offense rating mode:

In [7]:
modeOffense = dataset['offense_rating'].mode()
print("Mode value for offense rating:", modeOffense)

Mode value for offense rating: 0    0.0
dtype: float64


### The offense rating variance:

In [8]:
variance = dataset['offense_rating'].var()
print("Variance value for offense rating:", variance)

Variance value for offense rating: 0.960312183397919


### The offense rating standard deviation:

In [9]:
std = dataset['offense_rating'].std()
print("Standard Deviation value for offense rating:", std)

Standard Deviation value for offense rating: 0.9799551945869357


### The offense rating skewness:

In [10]:
skewness = dataset['offense_rating'].skew()
print("Skewness value for offense rating:", skewness)

Skewness value for offense rating: 2.025170682712623


### Processing the data

#### To help normalize the text, it should all be converterd to lower case.

In [51]:
processed_data = dataset.copy()
processed_data["text"] = processed_data['text'].str.lower()
print(dataset)
print(processed_data)

                                                   text  offense_rating
0     TENNESSEE: We're the best state. Nobody even c...            0.20
1     A man inserted an advertisement in the classif...            1.10
2     How many men does it take to open a can of bee...            2.40
3     Told my mom I hit 1200 Twitter followers. She ...            0.00
4     Roses are dead. Love is fake. Weddings are bas...            0.10
...                                                 ...             ...
7995  Lack of awareness of the pervasiveness of raci...            0.25
7996    Why are aspirins white? Because they work sorry            3.85
7997  Today, we Americans celebrate our independence...            0.00
7998  How to keep the flies off the bride at an Ital...            3.00
7999  "Each ounce of sunflower seeds gives you 37% o...            0.00

[8000 rows x 2 columns]
                                                   text  offense_rating
0     tennessee: we're the best state. 

#### Then, the contractions, accents and emojis should be removed, since it doesn't add meaning to sentences:

In [55]:
import contractions
import unicodedata
import emoji

def contract(text):
    
    text = unicodedata.normalize('NFKD', text)
    text = "".join([c for c in text if not unicodedata.combining(c)])
    text = emoji.demojize(text)
    
    expanded_words = []    

    for word in text.split():
      # using contractions.fix to expand the shotened words
      expanded_words.append(contractions.fix(word))   
    
    expanded_text = ' '.join(expanded_words)
    return expanded_text

processed_data["text"] = list(map(contract, processed_data.text))
print(processed_data)


                                                   text  offense_rating
0     tennessee: we are the best state. nobody even ...            0.20
1     a man inserted an advertisement in the classif...            1.10
2     how many men does it take to open a can of bee...            2.40
3     told my mom i hit 1200 twitter followers. she ...            0.00
4     roses are dead. love is fake. weddings are bas...            0.10
...                                                 ...             ...
7995  lack of awareness of the pervasiveness of raci...            0.25
7996    why are aspirins white? because they work sorry            3.85
7997  today, we americans celebrate our independence...            0.00
7998  how to keep the flies off the bride at an ital...            3.00
7999  "each ounce of sunflower seeds gives you 37% o...            0.00

[8000 rows x 2 columns]


#### Removing punctuation:

In [44]:
import re

def remove_punctuation(text):  
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    return text

processed_data["text"] = list(map(remove_punctuation, processed_data.text))

print(processed_data)

                                                   text  offense_rating
0     tennessee  we are the best state  nobody even ...            0.20
1     a man inserted an advertisement in the classif...            1.10
2     how many men does it take to open a can of bee...            2.40
3     told my mom i hit 1200 twitter followers  she ...            0.00
4     roses are dead  love is fake  weddings are bas...            0.10
...                                                 ...             ...
7995  lack of awareness of the pervasiveness of raci...            0.25
7996    why are aspirins white  because they work sorry            3.85
7997  today  we americans celebrate our independence...            0.00
7998  how to keep the flies off the bride at an ital...            3.00
7999   each ounce of sunflower seeds gives you 37  o...            0.00

[8000 rows x 2 columns]


#### Removing stopwords

In [45]:
from nltk.corpus import stopwords

def remove_stopwords(text):  
    text = text.split()
    stops = set(stopwords.words("english"))
    text = [w for w in text if not w in stops]
    text = " ".join(text)
    return text

processed_data["text"] = list(map(remove_stopwords, processed_data.text))
print(processed_data)

                                                   text  offense_rating
0     tennessee best state nobody even comes close e...            0.20
1     man inserted advertisement classifieds wife wa...            1.10
2     many men take open beer none open time brings ...            2.40
3     told mom hit 1200 twitter followers pointed br...            0.00
4     roses dead love fake weddings basically funera...            0.10
...                                                 ...             ...
7995  lack awareness pervasiveness racism society pr...            0.25
7996                          aspirins white work sorry            3.85
7997  today americans celebrate independence britain...            0.00
7998  keep flies bride italian wedding keep bucket s...            3.00
7999  ounce sunflower seeds gives 37 daily need vita...            0.00

[8000 rows x 2 columns]


#### Tokenizing text

In [46]:
import nltk

def tokenize(text):
    text =  nltk.WordPunctTokenizer().tokenize(text)
    return text

processed_data["text"] = list(map(tokenize, processed_data.text))

print(processed_data)

                                                   text  offense_rating
0     [tennessee, best, state, nobody, even, comes, ...            0.20
1     [man, inserted, advertisement, classifieds, wi...            1.10
2     [many, men, take, open, beer, none, open, time...            2.40
3     [told, mom, hit, 1200, twitter, followers, poi...            0.00
4     [roses, dead, love, fake, weddings, basically,...            0.10
...                                                 ...             ...
7995  [lack, awareness, pervasiveness, racism, socie...            0.25
7996                     [aspirins, white, work, sorry]            3.85
7997  [today, americans, celebrate, independence, br...            0.00
7998  [keep, flies, bride, italian, wedding, keep, b...            3.00
7999  [ounce, sunflower, seeds, gives, 37, daily, ne...            0.00

[8000 rows x 2 columns]


### Splitting data into train and test data

In [47]:
from sklearn.model_selection import train_test_split
percentage = 0.8
data_train, data_test = train_test_split(processed_data, train_size = percentage)

data_train_lem = data_train.copy()
data_test_lem = data_test.copy()

data_train_stem = data_train.copy()
data_test_stem = data_test.copy()

#### Lemmatizing:

In [48]:
# Lemmatizing

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

def lemmatized_words(dataset):
    lemm = nltk.stem.WordNetLemmatizer()
    dataset['text'] = list(map(lambda word:
                                     list(map(lemm.lemmatize, word)),
                                     dataset.text))
    
lemmatized_words(data_train_lem)
lemmatized_words(data_test_lem)

def join_tokens(text):
    text = " ".join(text)
    text.strip()
    return text

data_train_lem["text"] = list(map(join_tokens, data_train_lem.text))
data_train_lem["text"] = data_train_lem["text"].str.strip()

data_test_lem["text"] = list(map(join_tokens, data_test_lem.text))
data_test_lem["text"] = data_test_lem["text"].str.strip()

print(data_train_lem)
print(data_test_lem)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Tommy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


                                                   text  offense_rating
79    wife disappointed find nickname college love m...            0.35
3931  total amount money owed every person country w...            0.00
3959     ex wife deaf left deaf friend honest seen sign            0.40
489   buy whitening toothpaste say guaranteed whiten...            1.75
7339  2016 dubai man mohamed basheer bought lottery ...            0.30
...                                                 ...             ...
2639  choose family take fate hand like love underst...            0.00
1523  great day talking adoption saturday national a...            0.15
2784  I collecting flood victim india said woman sto...            1.10
6227             ellen pompeo also jimmy kimmel tonight            0.00
4618  girlfriend told ever cheated would worse black...            3.65

[6400 rows x 2 columns]
                                                   text  offense_rating
1364  speak repeat already know listen 

#### Stemming:

In [49]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stem(text):
    text = ' '.join([ps.stem(w) for w in text])
    return text

data_train_stem["text"] = list(map(stem, data_train_stem.text))
data_train_stem["text"] = data_train_stem["text"].str.strip()

data_test_stem["text"] = list(map(stem, data_test_stem.text))
data_test_stem["text"] = data_test_stem["text"].str.strip()

print(data_train_stem)
print(data_test_stem)

                                                   text  offense_rating
79    wife disappoint find nicknam colleg love machi...            0.35
3931  total amount money owe everi person countri wo...            0.00
3959     ex wife deaf left deaf friend honest seen sign            0.40
489   buy whiten toothpast say guarante white 14 day...            1.75
7339  2016 dubai man moham basheer bought lotteri ti...            0.30
...                                                 ...             ...
2639  choos famili take fate hand like love understa...            0.00
1523  great day talk adopt saturday nation adopt day ❤️            0.15
2784  I collect flood victim india said woman stood ...            1.10
6227             ellen pompeo also jimmi kimmel tonight            0.00
4618  girlfriend told ever cheat would wors black gi...            3.65

[6400 rows x 2 columns]
                                                   text  offense_rating
1364  speak repeat alreadi know listen 

## Using an SVR algorithm

In [50]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVR
import time

# This is already using the "optimal" parameters
svr_lem_model = Pipeline([('vect', CountVectorizer(ngram_range = (1, 1), stop_words='english')),
                          ('tfidf', TfidfTransformer(use_idf = False)), ('svr', SVR(kernel = "rbf", 
                            C = 1000.0, shrinking = True, gamma = "scale"))])

start_time = time.time()

svr_lem_model = svr_lem_model.fit(data_train_lem.text, data_train_lem.offense_rating)

print("Training time for the lemmatied text: %s seconds." % (time.time() - start_time))

# =================================================================================================

# This is already using the "optimal" parameters
svr_stem_model = Pipeline([('vect', CountVectorizer(ngram_range = (1, 1), stop_words='english')), 
                           ('tfidf', TfidfTransformer(use_idf = False)), ('svr', SVR(kernel = "rbf",
                            C = 1000.0, shrinking = True, gamma = "scale"))])

start_time = time.time()

svr_stem_model = svr_stem_model.fit(data_train_stem.text, data_train_stem.offense_rating)

print("Training time for the stemmed text: %s seconds." % (time.time() - start_time))


Training time for the lemmatied text: 3.399998664855957 seconds.
Training time for the stemmed text: 3.4740023612976074 seconds.


### Grid search to determine best parameters
#### Parameters for the lemmatization model
This takes a while to run so here are the best parameters for the model used with lemmatized text:
- svr__gamma: 'scale'
- svr__kernel: 'rbf'
- svr__shrinking: True
- tfidf__use_idf: False
- vect__ngram_range: (1, 1)

In [None]:
from sklearn.model_selection import GridSearchCV
parameters_svr = {'vect__ngram_range': [(1, 1), (1, 2)],
               'tfidf__use_idf': (True, False),
               'svr__kernel': ("linear", "poly", "rbf", "sigmoid", "precomputed"),
                   'svr__shrinking': (True, False),
                   'svr__gamma': ("scale", "auto"),
}
gs_svr = GridSearchCV(svr_lem_model, parameters_svr, n_jobs=-1)
gs_svr = gs_svr.fit(data_train_lem.text, data_train_lem.offense_rating)
gs_svr.best_score_
gs_svr.best_params_

#### Parameters for the stemming model
This takes a while to run so here are the best parameters for the model used with stemmed text:
- svr__gamma: 'scale'
- svr__kernel: 'rbf'
- svr__shrinking: True
- tfidf__use_idf: False
- vect__ngram_range: (1, 1)

In [None]:
parameters_svr = {'vect__ngram_range': [(1, 1), (1, 2)],
               'tfidf__use_idf': (True, False),
               'svr__kernel': ("linear", "poly", "rbf", "sigmoid", "precomputed"),
                   'svr__shrinking': (True, False),
                   'svr__gamma': ("scale", "auto"),
}
gs_svr = GridSearchCV(svr_stem_model, parameters_svr, n_jobs=-1)
gs_svr = gs_svr.fit(data_train_stem.text, data_train_stem.offense_rating)
gs_svr.best_score_
gs_svr.best_params_

### Metrics for the SVR

#### MSE, RMSE and MAE for the SVR model trained with lemmatized text

In [27]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
predicted = svr_lem_model.predict(data_test_lem.text)
true = data_test_lem.offense_rating
print("Mean squared error: %s." % mean_squared_error(true, predicted))
print("Root mean squared error: %s." % mean_squared_error(true, predicted, squared = False))
print("Mean absolute error: %s." % mean_absolute_error(true, predicted))
print("R squared: %s." % r2_score(true, predicted))

Mean squared error: 0.49486299501846787.
Root mean squared error: 0.7034649920347621.
Mean absolute error: 0.4915382667113362.
R squared: 0.5003779094037234.


#### MSE and RMSE for the SVR model trained with stemmed text

In [28]:
predicted = svr_stem_model.predict(data_test_stem.text)
true = data_test_stem.offense_rating
print("Mean squared error: %s." % mean_squared_error(true, predicted))
print("Root mean squared error: %s." % mean_squared_error(true, predicted, squared = False))
print("Mean absolute error: %s." %mean_absolute_error(true, predicted))
print("R squared: %s." % r2_score(true, predicted))

Mean squared error: 0.4932584092612884.
Root mean squared error: 0.7023235787450741.
Mean absolute error: 0.4904988812054751.
R squared: 0.5019979264561464.


## KNN

In [57]:
from sklearn.neighbors import KNeighborsRegressor

# This is already using the "optimal" parameters
knn_lem_model = Pipeline([('vect', CountVectorizer(ngram_range = (1, 1), stop_words='english')),
                          ('tfidf', TfidfTransformer(use_idf = True)), ('knn', KNeighborsRegressor(n_neighbors = 10))])

start_time = time.time()

knn_lem_model = knn_lem_model.fit(data_train_lem.text, data_train_lem.offense_rating)

print("Training time for the lemmatied text: %s seconds." % (time.time() - start_time))

# =================================================================================================

# This is already using the "optimal" parameters
knn_stem_model = Pipeline([('vect', CountVectorizer(ngram_range = (1, 1), stop_words='english')), 
                           ('tfidf', TfidfTransformer(use_idf = False)), ('knn', KNeighborsRegressor(n_neighbors = 10))])

start_time = time.time()

knn_stem_model = knn_stem_model.fit(data_train_stem.text, data_train_stem.offense_rating)

print("Training time for the stemmed text: %s seconds." % (time.time() - start_time))

Training time for the lemmatied text: 0.0840001106262207 seconds.
Training time for the stemmed text: 0.07299947738647461 seconds.


### Metrics for the KNN

#### MSE, RMSE and MAE for the KNN model trained with lemmatized text

In [58]:
predicted = knn_lem_model.predict(data_test_lem.text)
true = data_test_lem.offense_rating
print("Mean squared error: %s." % mean_squared_error(true, predicted))
print("Root mean squared error: %s." % mean_squared_error(true, predicted, squared = False))
print("Mean absolute error: %s." % mean_absolute_error(true, predicted))
print("R squared: %s." % r2_score(true, predicted))

Mean squared error: 0.66582678125.
Root mean squared error: 0.8159820961577527.
Mean absolute error: 0.53548125.
R squared: 0.3191093312352332.


#### MSE and RMSE for the KNN model trained with stemmed text

In [59]:
predicted = svr_stem_model.predict(data_test_stem.text)
true = data_test_stem.offense_rating
print("Mean squared error: %s." % mean_squared_error(true, predicted))
print("Root mean squared error: %s." % mean_squared_error(true, predicted, squared = False))
print("Mean absolute error: %s." %mean_absolute_error(true, predicted))
print("R squared: %s." % r2_score(true, predicted))

Mean squared error: 0.49192619177096353.
Root mean squared error: 0.7013745017969811.
Mean absolute error: 0.4720961200776073.
R squared: 0.4969443057411769.


## Decision Tree

In [60]:
from sklearn.tree import DecisionTreeRegressor

# This is already using the "optimal" parameters
dt_lem_model = Pipeline([('vect', CountVectorizer(ngram_range = (1, 1), stop_words='english')),
                          ('tfidf', TfidfTransformer(use_idf = True)), ('dt', DecisionTreeRegressor())])

start_time = time.time()

dt_lem_model = dt_lem_model.fit(data_train_lem.text, data_train_lem.offense_rating)

print("Training time for the lemmatied text: %s seconds." % (time.time() - start_time))

# =================================================================================================

# This is already using the "optimal" parameters
dt_stem_model = Pipeline([('vect', CountVectorizer(ngram_range = (1, 1), stop_words='english')), 
                           ('tfidf', TfidfTransformer(use_idf = False)), ('dt', DecisionTreeRegressor())])

start_time = time.time()

dt_stem_model = dt_stem_model.fit(data_train_stem.text, data_train_stem.offense_rating)

print("Training time for the stemmed text: %s seconds." % (time.time() - start_time))

Training time for the lemmatied text: 2.6610000133514404 seconds.
Training time for the stemmed text: 2.1449999809265137 seconds.


### Metrics for the Decision Tree

#### MSE, RMSE and MAE for the Decision Tree model trained with lemmatized text

In [62]:
predicted = dt_lem_model.predict(data_test_lem.text)
true = data_test_lem.offense_rating
print("Mean squared error: %s." % mean_squared_error(true, predicted))
print("Root mean squared error: %s." % mean_squared_error(true, predicted, squared = False))
print("Mean absolute error: %s." % mean_absolute_error(true, predicted))
print("R squared: %s." % r2_score(true, predicted))

Mean squared error: 0.8497890625.
Root mean squared error: 0.9218400417100572.
Mean absolute error: 0.49140625.
R squared: 0.13098502588264704.


#### MSE and RMSE for the Decision Tree model trained with stemmed text

In [63]:
predicted = dt_stem_model.predict(data_test_stem.text)
true = data_test_stem.offense_rating
print("Mean squared error: %s." % mean_squared_error(true, predicted))
print("Root mean squared error: %s." % mean_squared_error(true, predicted, squared = False))
print("Mean absolute error: %s." %mean_absolute_error(true, predicted))
print("R squared: %s." % r2_score(true, predicted))

Mean squared error: 0.7666249999999999.
Root mean squared error: 0.8755712421042618.
Mean absolute error: 0.4698125.
R squared: 0.21603061991314387.
