In [1]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Load the airline tweets CSV into a DataFrame.
tweets_csv_path = r'../input/Tweets.csv'
dataset = pd.read_csv(tweets_csv_path)

In [4]:
# Preview the first five rows of the loaded frame.
dataset.head(n=5)

**Fraction of missing values in every column — columns with more than 90% missing values are then dropped**

In [5]:
# Share of missing entries per column: (rows - non-null count) / rows.
row_total = len(dataset)
missing_counts = row_total - dataset.count()
missing_counts / row_total

In [6]:
# Drop the three columns chosen for removal; the result replaces `dataset`.
sparse_columns = ['airline_sentiment_gold','negativereason_gold','tweet_coord']
dataset = dataset.drop(columns=sparse_columns)

In [7]:
# Show the first three rows after the column drop.
dataset.iloc[:3]

In [8]:
# Number of tweets per sentiment label.
sentiment_column = dataset['airline_sentiment']
mood_count = sentiment_column.value_counts()

In [9]:
# Display the per-sentiment tweet counts.
mood_count

In [10]:
import seaborn as sns
import matplotlib.pyplot as plt

**Graphical representation of airline sentiment :-**

In [11]:
# Bar chart of tweet counts per sentiment class, with the bars in a fixed order.
sentiment_order = ['negative','neutral','positive']
sns.countplot(data=dataset, x='airline_sentiment', order=sentiment_order)
plt.show()

**Graphical representation of airline sentiment with airlines:-** 

In [12]:
# One count panel per airline (three panels per row). Axes are not shared, so
# every airline keeps its own scale.
# `factorplot` and its `size` argument were deprecated in seaborn 0.9 and
# later removed; `catplot` with `height` is the replacement.
sns.catplot(x='airline_sentiment', data=dataset,
            order=['negative', 'neutral', 'positive'], kind='count',
            col='airline', col_wrap=3, height=4, aspect=0.6,
            sharex=False, sharey=False)
plt.show()

In [13]:
# How often each negative-reason label occurs.
negative_reasons = dataset['negativereason']
negative_reasons.value_counts()

**Graphical representation of negativereason towards airlines:-**

In [14]:
# Tweet counts per airline, with one bar colour per negative reason.
# `factorplot` and its `size` argument were deprecated in seaborn 0.9 and
# later removed; `catplot` with `height` is the replacement.
sns.catplot(x='airline', data=dataset, kind='count',
            hue='negativereason', height=12, aspect=.9)
plt.show()

The charts above and below present the same information: the reasons for negative comments, broken down by airline.

In [15]:
# One panel per airline (two per row) counting tweets by negative reason.
# Axes are not shared between panels.
# `factorplot` and its `size` argument were deprecated in seaborn 0.9 and
# later removed; `catplot` with `height` is the replacement.
sns.catplot(x='negativereason', data=dataset, kind='count',
            col='airline', col_wrap=2, height=9, aspect=.8,
            sharex=False, sharey=False)
plt.show()

**Data Cleaning and Preprocessing :-**

In [16]:
import re
import nltk
import time

In [17]:
start_time = time.time()
# Strip links first. Once punctuation is blanked out below, a URL such as
# "http://t.co/x" would read "http   t co x", and the old pattern 'http.*'
# then deleted every word that followed the link as well.
dataset['text'] = dataset['text'].map(lambda x: re.sub(r'http\S*', '', str(x)))
# Remove @mentions (an "@" followed by any run of word characters).
dataset['text'] = dataset['text'].map(lambda x: re.sub(r'@\w*', '', str(x)))
# Replace every character that is not an ASCII letter with a space.
dataset['text'] = dataset['text'].map(lambda x: re.sub(r'[^a-zA-Z]', ' ', str(x)))
end_time = time.time()

In [50]:
# Wall-clock seconds between the recorded start and end timestamps.
elapsed_seconds = end_time - start_time
elapsed_seconds

In [19]:
# Inspect the first five tweets.
dataset['text'].head(n=5)

In [20]:
def _to_lower(tweet):
    """Return the string form of `tweet` in lower case."""
    return str(tweet).lower()


# Lower-case every tweet so later token matching is case-insensitive.
dataset['text'] = dataset['text'].map(_to_lower)

In [21]:
# Show the first two lower-cased tweets.
dataset['text'].iloc[:2]

In [22]:
from nltk.corpus import stopwords

In [23]:
# Will hold the cleaned comments, one string per tweet.
corpus = []

* Remove stopwords from the comments
* PorterStemmer is not used, so words are not reduced to their stems

In [24]:
# Build the stop-word set once; rebuilding it for every single word (as the
# previous one-liner did) dominates the runtime of this cell.
english_stopwords = set(stopwords.words('english'))
# Rebuild the corpus from scratch, so re-running this cell cannot append
# duplicate entries. Each entry is a tweet with its stop words removed and
# the remaining words re-joined with single spaces.
corpus = [
    ' '.join(word for word in str(text).strip().split()
             if word not in english_stopwords)
    for text in dataset['text']
]

In [25]:
# Peek at the first four entries of the corpus.
corpus[:4]

**Training Part :-**

In [26]:
# Wrap the corpus in a single-column feature frame.
X = pd.DataFrame({'comment_text': corpus})

In [27]:
# Preview the first five rows of the feature frame.
X.head(n=5)

In [28]:
# Encode sentiment as a numeric target: 'negative' -> -1, 'neutral' and 'positive' -> 1.
# NOTE(review): 'neutral' shares label 1 with 'positive', which makes this a
# binary target — confirm that merging the two classes is intended.
sentiment_to_label = {'neutral':1,'negative':-1,'positive':1}
y = dataset['airline_sentiment'].map(sentiment_to_label)

In [29]:
# Show the first two encoded labels.
y.iloc[:2]

Split data into Train and Test:-

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [32]:
# Report the shape of each piece of the split on one line.
split_shapes = [part.shape for part in (X_train, X_test, y_train, y_test)]
print(*split_shapes)

Use TfidfVectorizer for feature extraction :-

In [33]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
# TF-IDF over single words (unigrams). token_pattern=r'\w{2,}' keeps only
# tokens made of two or more word characters.
tfidf_options = dict(
    stop_words='english',
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{2,}',
    ngram_range=(1,1),
    max_features=30000,
)
vector = TfidfVectorizer(**tfidf_options)

In [36]:
# Fit the vectorizer on the training comments only, then convert the result to a dense array.
train_tfidf = vector.fit_transform(X_train['comment_text'])
X_train_word_feature = train_tfidf.toarray()

In [37]:
# Transform the test comments with the already-fitted vectorizer, then convert to a dense array.
test_tfidf = vector.transform(X_test['comment_text'])
X_test_word_feature = test_tfidf.toarray()

In [38]:
# Report the shapes of the train and test feature matrices.
feature_shapes = [matrix.shape for matrix in (X_train_word_feature, X_test_word_feature)]
print(*feature_shapes)

Model Training :-

In [39]:
from sklearn.linear_model import LogisticRegression

In [40]:
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [41]:
# Logistic regression created with no arguments, i.e. the library defaults.
classifier = LogisticRegression()

In [42]:
# Train the classifier on the TF-IDF features of the training split.
classifier.fit(X=X_train_word_feature, y=y_train)

In [43]:
# Predict a label for every comment in the test split.
y_pred = classifier.predict(X=X_test_word_feature)

In [44]:
# Confusion matrix of true test labels against predicted labels.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [45]:
# Fraction of test comments whose predicted label matches the true label.
acc_score = accuracy_score(y_true=y_test, y_pred=y_pred)

In [46]:
# Print the per-class report, the confusion matrix and the accuracy, separated by newlines.
report_text = classification_report(y_test, y_pred)
print(report_text, '\n', cm, '\n', acc_score)

In [48]:
# Class probabilities for the held-out test comments, so that they line up
# row-for-row with `y_pred` and `y_test`. This previously scored the training
# features, which the model has already seen.
# Column order follows `classifier.classes_`.
y_pred_prob = classifier.predict_proba(X_test_word_feature)

To determine the probability of a comment being negative or positive :-

In [49]:
# First five rows of the predicted class probabilities.
y_pred_prob[0:5]