In [67]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import re
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,f1_score,precision_score,recall_score
from sklearn.feature_extraction.text import TfidfVectorizer
# Single shared lemmatizer instance; normalizer() calls it for every token it keeps.
wordnet_lemmatizer = WordNetLemmatizer()

In [57]:
# Read the airline tweets CSV from the working directory and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='Tweets.csv')
df.head(n=5)

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


In [42]:
_STOP_WORDS_CACHE = {}


def _english_stop_words():
  """Return NLTK's English stop words as a frozenset, reading the corpus only on first use."""
  if 'english' not in _STOP_WORDS_CACHE:
    _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
  return _STOP_WORDS_CACHE['english']


def normalizer(tweet):
  """Clean one tweet into a space-separated string of lemmatized tokens.

  Steps: replace every character that is not an ASCII letter with a space,
  lower-case the text, split on whitespace, drop English stop words, and
  lemmatize what remains with the module-level ``wordnet_lemmatizer``.

  Args:
    tweet: The raw tweet. It is passed through ``str()`` first, so
      non-string values are stringified rather than rejected.

  Returns:
    str: The surviving lemmas joined by single spaces ('' if none survive).
  """
  # The original evaluated stopwords.words('english') once per token and tested
  # membership against a list; a cached frozenset gives the same filtering
  # without repeating that work for every word of every tweet.
  stop_words = _english_stop_words()
  tokens = re.sub("[^a-zA-Z]", " ", str(tweet)).lower().split()
  kept = [tok for tok in tokens if tok not in stop_words]
  return ' '.join(wordnet_lemmatizer.lemmatize(tok) for tok in kept)

In [15]:
import nltk
# Fetch the NLTK data packages this notebook relies on. The value of the last
# call is what the cell displays as its output.
nltk.download('stopwords')  # stop-word lists read by normalizer()
nltk.download('wordnet')  # presumably the data behind WordNetLemmatizer — confirm
nltk.download('omw-1.4')  # NOTE(review): likely a WordNet companion package required by this NLTK version — confirm

[nltk_data] Downloading package stopwords to C:\Users\Keanole
[nltk_data]     Nkwane\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Keanole
[nltk_data]     Nkwane\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to C:\Users\Keanole
[nltk_data]     Nkwane\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\omw-1.4.zip.


True

In [43]:
# Shuffle the rows with a fixed seed (0, matching the other seeded steps in
# this notebook) so a fresh re-run reproduces the same ordering.
df = shuffle(df, random_state=0)
# Labels are the sentiment classes. The frame has no 'Tweets' column, so the
# original df['Tweets'] lookup raised KeyError.
y = df['airline_sentiment']
# Normalize each tweet individually. DataFrame.apply(normalizer) would hand
# normalizer whole columns (Series) instead of single tweet strings.
x = df['text'].apply(normalizer)

In [44]:
vectorizer = CountVectorizer()
x_vectorized = vectorizer.fit_transform(x)

In [47]:
# Jupyter only renders the value of a cell's final expression, so a bare
# `y.shape` on its own line was computed and then discarded. Emit both shapes
# as one tuple so labels and features can be compared side by side.
y.shape, x.shape

In [58]:
# TF-IDF encoding of the raw tweet text: keep at most 2500 terms, ignoring
# terms seen in fewer than 7 tweets or in more than 80% of them.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split that follows, so its statistics also see the held-out tweets — confirm
# whether that is acceptable for this evaluation.
vectorizer = TfidfVectorizer(max_features=2500, min_df=7, max_df=0.8)
features = vectorizer.fit_transform(df['text']).toarray()

In [60]:
# Target labels: the sentiment class recorded for each tweet.
labels = df['airline_sentiment']

In [61]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    random_state=0,
    test_size=0.2,
)


In [64]:
# Train a 200-tree random forest on the training split; the seed makes the
# fitted forest reproducible. The fitted estimator is the cell's output.
text_classifier = RandomForestClassifier(random_state=0, n_estimators=200)
text_classifier.fit(X=X_train, y=y_train)

RandomForestClassifier(n_estimators=200, random_state=0)

In [66]:
# Predict a sentiment label for every row of the held-out split.
predictions = text_classifier.predict(X_test)


In [68]:
# Report the confusion matrix, the per-class precision/recall/F1 table and the
# overall accuracy for the held-out predictions, one after another.
print(
    confusion_matrix(y_true=y_test, y_pred=predictions),
    classification_report(y_true=y_test, y_pred=predictions),
    accuracy_score(y_true=y_test, y_pred=predictions),
    sep='\n',
)

[[1778   76   16]
 [ 332  255   27]
 [ 183   62  199]]
              precision    recall  f1-score   support

    negative       0.78      0.95      0.85      1870
     neutral       0.65      0.42      0.51       614
    positive       0.82      0.45      0.58       444

    accuracy                           0.76      2928
   macro avg       0.75      0.60      0.65      2928
weighted avg       0.76      0.76      0.74      2928

0.7622950819672131


In [69]:
import pickle

In [71]:
# Bundle the fitted vectorizer with the trained classifier so both can be
# restored together from a single file.
pickl = {
    'vectorizer': vectorizer,
    'model': text_classifier,
}
# Write through a context manager so the file is flushed and closed even if
# pickling fails; the original never closed the handle returned by open().
with open('models.p', 'wb') as model_file:
    pickle.dump(pickl, model_file)