### Importing libraries

In [61]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix,f1_score,precision_score,recall_score
wordnet_lemmatizer = WordNetLemmatizer()

In [85]:
df = pd.read_csv("D:\\Projects\\New project\\Tweets.csv")

In [86]:
df.head()

Unnamed: 0,tweet_id,airline_sentiment,airline_sentiment_confidence,negativereason,negativereason_confidence,airline,airline_sentiment_gold,name,negativereason_gold,retweet_count,text,tweet_coord,tweet_created,tweet_location,user_timezone
0,570306133677760513,neutral,1.0,,,Virgin America,,cairdin,,0,@VirginAmerica What @dhepburn said.,,2015-02-24 11:35:52 -0800,,Eastern Time (US & Canada)
1,570301130888122368,positive,0.3486,,0.0,Virgin America,,jnardino,,0,@VirginAmerica plus you've added commercials t...,,2015-02-24 11:15:59 -0800,,Pacific Time (US & Canada)
2,570301083672813571,neutral,0.6837,,,Virgin America,,yvonnalynn,,0,@VirginAmerica I didn't today... Must mean I n...,,2015-02-24 11:15:48 -0800,Lets Play,Central Time (US & Canada)
3,570301031407624196,negative,1.0,Bad Flight,0.7033,Virgin America,,jnardino,,0,@VirginAmerica it's really aggressive to blast...,,2015-02-24 11:15:36 -0800,,Pacific Time (US & Canada)
4,570300817074462722,negative,1.0,Can't Tell,1.0,Virgin America,,jnardino,,0,@VirginAmerica and it's a really big bad thing...,,2015-02-24 11:14:45 -0800,,Pacific Time (US & Canada)


### Removing unnecessary columns from the data sets


In [79]:
# cols_to_remove = ["Unnamed: 0"]

# #Drop the columns
# df.drop(cols_to_remove, axis=1, inplace=True)

# #Save modified data
# df.to_csv("tweets.csv")

### Modified dataset to use for the data preprocessing

In [87]:
df.describe()

Unnamed: 0,tweet_id,airline_sentiment_confidence,negativereason_confidence,retweet_count
count,14640.0,14640.0,10522.0,14640.0
mean,5.692184e+17,0.900169,0.638298,0.08265
std,779111200000000.0,0.16283,0.33044,0.745778
min,5.675883e+17,0.335,0.0,0.0
25%,5.685592e+17,0.6923,0.3606,0.0
50%,5.694779e+17,1.0,0.6706,0.0
75%,5.698905e+17,1.0,1.0,0.0
max,5.703106e+17,1.0,1.0,44.0


In [118]:
def normalizer(tweet):
    # remove URLs
    only_letters = re.sub("http\S+", "", tweet)
    # replace @username with user
    only_letters = re.sub("@\w+", "user", only_letters)
    # remove all symbols except hashtags
    only_letters = re.sub("[^\w\s#]", "", only_letters)
    # convert to lowercase
    only_letters = only_letters.lower()
    only_letters = only_letters.split()
    
    # tokenize the tweet
#     tokens = nltk.word_tokenize(only_letters)
    # remove stopwords
    filtered_result = [word for word in only_letters if word not in stopwords.words('english')]
    # lemmatize words using WordNetLemmatizer
    lemmas = [WordNetLemmatizer().lemmatize(t) for t in filtered_result]
    # join the lemmas back into a string
    lemmas = ' '.join(lemmas)
    return lemmas


In [134]:
normalizer('Movie is okay')

'movie okay'

In [121]:
df = shuffle(df)
y = df['airline_sentiment']
x = df.text.apply(normalizer)

In [122]:
vectorizer = CountVectorizer()
x_vectorized = vectorizer.fit_transform(x)

In [123]:
x_vectorized

<14640x13661 sparse matrix of type '<class 'numpy.int64'>'
	with 143399 stored elements in Compressed Sparse Row format>

### Train the data

In [124]:
train_x,val_x,train_y,val_y = train_test_split(x_vectorized,y)

In [125]:
regressor = LogisticRegression(multi_class='multinomial', solver='newton-cg')
model = regressor.fit(train_x, train_y)

In [126]:
params = {'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }
gs_clf = GridSearchCV(model, params, n_jobs=1, cv=5)
gs_clf = gs_clf.fit(train_x, train_y)
model = gs_clf.best_estimator_
     

In [127]:
y_pred = model.predict(val_x)

_f1 = f1_score(val_y, y_pred, average='micro')
_confusion = confusion_matrix(val_y, y_pred)
__precision = precision_score(val_y, y_pred, average='micro')
_recall = recall_score(val_y, y_pred, average='micro')
_statistics = {'f1_score': _f1,
               'confusion_matrix': _confusion,
               'precision': __precision,
               'recall': _recall
              }

In [128]:
print(_statistics)

{'f1_score': 0.7907103825136612, 'confusion_matrix': array([[2048,  213,   52],
       [ 246,  432,   70],
       [ 108,   77,  414]], dtype=int64), 'precision': 0.7907103825136612, 'recall': 0.7907103825136612}


#### Test some of the outputs

In [129]:
test_feature = vectorizer.transform(['Meat Week Day 3: I am okay'])
model.predict(test_feature)

array(['negative'], dtype=object)

In [136]:

test_feature = vectorizer.transform(['Product  is  okay'])
model.predict(test_feature)

array(['neutral'], dtype=object)

In [131]:
test_feature = vectorizer.transform(['This statue is pretty huge'])
model.predict(test_feature)

array(['positive'], dtype=object)

### Export the model

In [113]:
import pickle


In [132]:
pickl = {'vectorizer': vectorizer,
         'model': model
         }
pickle.dump(pickl, open('models'+".p", "wb"))

# git

This is good