In [1]:

%matplotlib  inline
import re
from collections import Counter

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
import sklearn
import spacy

from nltk.corpus import stopwords
from nltk.tokenize import WordPunctTokenizer

from sklearn import ensemble
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer

For this challenge, you will need to choose a corpus of data from nltk or another source that includes categories you can predict and create an analysis pipeline that includes the following steps:

- Data cleaning / processing / language parsing 
- Create features using two different NLP methods: For example, BoW vs tf-idf.
- Use the features to fit supervised learning models for each feature set to predict the category outcomes.
- Assess your models using cross-validation and determine whether one model performed better.
- Pick one of the models and try to increase accuracy by at least 5 percentage points.


Write up your report in a Jupyter notebook. Be sure to explicitly justify the choices you make throughout, and submit it below.

In [2]:
# Fetch only the stopword corpus that text_cleaner reads. A bare
# nltk.download() opens the interactive downloader, which stalls an
# unattended "Restart Kernel -> Run All".
nltk.download('stopwords')

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
# Load the airline-tweets CSV.
# A raw string keeps every backslash literal; the original literal mixed "\\"
# with a bare "\D", which is an invalid escape sequence. The resulting path
# value is unchanged.
# NOTE(review): this is a machine-specific absolute path -- consider a
# configurable DATA_DIR so the notebook runs on other machines.
raw_data = pd.read_csv(r'C:\Users\david\Desktop\thinkful datasets\Tweets.csv')

In [4]:
# Preview the first five rows (head() defaults to n=5) to see the available columns.
raw_data.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [5]:
# Keep only the label column and the tweet body.
# .copy() makes the result an independent frame, so later column writes do not
# target a slice of the original (SettingWithCopyWarning).
raw_data = raw_data[['airline_sentiment', 'text']].copy()

In [6]:
# Confirm only the sentiment label and tweet text remain (head() defaults to 5 rows).
raw_data.head()

Unnamed: 0,airline_sentiment,text
0,neutral,@VirginAmerica What @dhepburn said.
1,positive,@VirginAmerica plus you've added commercials t...
2,neutral,@VirginAmerica I didn't today... Must mean I n...
3,negative,@VirginAmerica it's really aggressive to blast...
4,negative,@VirginAmerica and it's a really big bad thing...


In [7]:
# (rows, columns) of the working frame.
raw_data.shape

(14640, 2)

In [8]:
# Compiled once instead of on every call. Alternatives, tried in order at each
# position: an @handle made of letters/digits, any single character that is not
# alphanumeric/space/tab, or a scheme://... URL.
_TWEET_NOISE_RE = re.compile(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)")


def text_cleaner(raw_text, stopword_set=None):
    """Normalise one tweet into a lower-cased, stopword-free string.

    Steps: replace @handles, URLs and every non-alphanumeric character with a
    space, lower-case the text, drop stopwords, and re-join the remaining
    tokens with single spaces.

    Parameters
    ----------
    raw_text : str
        The raw tweet text.
    stopword_set : collection of str, optional
        Lower-case words to drop. When omitted, NLTK's English stopword list is
        used; it is loaded once and cached on the function so that cleaning a
        whole column does not re-read the corpus for every row.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ("" if nothing remains).

    Notes
    -----
    The handle pattern does not include "_", so for a handle such as
    "@some_user" only "@some" is removed as a handle; the "_" becomes a space
    and "user" is kept as a token.
    """
    if stopword_set is None:
        stopword_set = getattr(text_cleaner, "_default_stopwords", None)
        if stopword_set is None:
            stopword_set = frozenset(stopwords.words("english"))
            text_cleaner._default_stopwords = stopword_set

    # Replace the noise with spaces (not ""), so neighbouring words stay separate.
    letters_only_text = _TWEET_NOISE_RE.sub(" ", raw_text)

    # split() with no arguments also collapses the runs of spaces left behind.
    words = letters_only_text.lower().split()

    return " ".join(w for w in words if w not in stopword_set)

In [9]:
# Clean every tweet in a single pass over the column.
# Building a new frame with .assign replaces the chained assignment
# raw_data['text'][i] = ..., which raises SettingWithCopyWarning and does not
# write through at all when pandas Copy-on-Write is enabled.
raw_data = raw_data.assign(text=raw_data['text'].apply(text_cleaner))

In [10]:
# Spot-check the cleaned tweets (head() defaults to 5 rows).
raw_data.head()

Unnamed: 0,airline_sentiment,text
0,neutral,said
1,positive,plus added commercials experience tacky
2,neutral,today must mean need take another trip
3,negative,really aggressive blast obnoxious entertainmen...
4,negative,really big bad thing


In [11]:
# Count missing values per column (isna is the same check as isnull).
print(raw_data.isna().sum())

airline_sentiment    0
text                 0
dtype: int64


In [12]:
# Encode the sentiment strings as integers. LabelEncoder numbers the classes in
# sorted order, giving negative -> 0, neutral -> 1, positive -> 2.
# (LabelEncoder is imported in the top import cell.)
lb_make = LabelEncoder()
raw_data['airline_sentiment'] = lb_make.fit_transform(raw_data['airline_sentiment'])

In [13]:
# Features and target are kept as single-column DataFrames (list selector),
# since later cells index X by column name and flatten y explicitly.
X = raw_data.loc[:, ["text"]]
y = raw_data.loc[:, ["airline_sentiment"]]

In [14]:
# Preview the encoded labels instead of dumping the whole target frame.
y.head(10)

Unnamed: 0,airline_sentiment
0,1
1,2
2,1
3,0
4,0
5,0
6,2
7,1
8,2
9,2


In [15]:
# Label encoding maps the sentiment classes to 0 = negative, 1 = neutral, 2 = positive

In [16]:
# Feature set 1: bag-of-words token counts.
# (CountVectorizer is imported in the top import cell.)
count_vect = CountVectorizer()

In [25]:
# Hold out 33% of the tweets for testing; the fixed seed keeps the split stable.
# y is flattened to a 1-D array before splitting.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y.values.ravel(),
    test_size=0.33,
    random_state=42,
)

In [26]:
# Learn the vocabulary from the training tweets only, then reuse that same
# vocabulary for the test tweets so no test-set information leaks into fitting.
train_data_bow = count_vect.fit_transform(X_train['text'])
test_data_bow = count_vect.transform(X_test['text'])

In [27]:
# Feature set 2: tf-idf weighted term features (default settings).
tfid_vect = TfidfVectorizer()

In [28]:
# Fit the tf-idf vocabulary and weights on the training tweets only ...
train_data_tfid = tfid_vect.fit_transform(X_train['text'])

# ... and apply the already-fitted vectorizer to the held-out tweets.
test_data_tfid = tfid_vect.transform(X_test['text'])

In [29]:
# (training tweets, vocabulary size) of the tf-idf matrix.
train_data_tfid.shape

(9808, 10505)

In [31]:
# Random forest on the bag-of-words features, scored with 10-fold cross-validation.
# random_state pins the forest's random sampling so the fold scores are
# reproducible across kernel restarts.
# (ensemble and cross_val_score are imported in the top import cell.)
rfc = ensemble.RandomForestClassifier(random_state=42)
cross_val_score(rfc, train_data_bow, y_train, cv=10)

array([0.73319756, 0.72912424, 0.75050916, 0.73496432, 0.72884811,
       0.73061224, 0.74081633, 0.73571429, 0.72755102, 0.73367347])

In [33]:
# Logistic regression on the bag-of-words features, 10-fold cross-validation.
# (LogisticRegression is imported in the top import cell.)
lr = LogisticRegression()
cross_val_score(lr, train_data_bow, y_train, cv=10)

array([0.7586558 , 0.77189409, 0.78818737, 0.7706422 , 0.77166157,
       0.77244898, 0.76428571, 0.78469388, 0.76530612, 0.76734694])

In [34]:
# Gradient boosting on the bag-of-words features: 500 shallow (depth-2) trees.
# 'loss' is left at its default: the explicit 'deviance' value was deprecated
# and later removed from scikit-learn, while the default selects the same
# loss on every version.
# random_state makes the fold scores reproducible.
params = {'n_estimators': 500,
          'max_depth': 2,
          'random_state': 42}

# Build the (unfitted) classifier; cross_val_score fits a clone per fold.
clf = ensemble.GradientBoostingClassifier(**params)
out = cross_val_score(clf, train_data_bow, y_train, cv=10)
print(np.mean(out))
out

0.7283805986379306


array([0.72505092, 0.73930754, 0.73523422, 0.73190622, 0.73904179,
       0.72653061, 0.72959184, 0.71836735, 0.70102041, 0.7377551 ])

In [38]:
# Random forest on the tf-idf features, 10-fold cross-validation.
# random_state pins the forest's random sampling so the comparison against the
# bag-of-words run is reproducible.
rfc_tfid = ensemble.RandomForestClassifier(random_state=42)
cross_val_score(rfc_tfid, train_data_tfid, y_train, cv=10)

array([0.72606925, 0.72403259, 0.75661914, 0.73088685, 0.72579001,
       0.72857143, 0.73673469, 0.73061224, 0.71938776, 0.71836735])

In [39]:
# Logistic regression on the tf-idf features, 10-fold cross-validation.
# Note: this rebinds `lr`, replacing the estimator used for the bag-of-words run.
lr = LogisticRegression()
cross_val_score(
    estimator=lr,
    X=train_data_tfid,
    y=y_train,
    cv=10,
)

array([0.73828921, 0.75356415, 0.77189409, 0.74413863, 0.75840979,
       0.7622449 , 0.7622449 , 0.76632653, 0.73673469, 0.75204082])