# This notebook will explore data gathered from the US Airlines Sentiment Data Set

Summary of Data: Twitter responses towards US Airlines and the emotional state of each party.

Data Information:
Length of data set: 14,640
Columns in the data set:'tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'


In [1]:
# Importing Python modules
import numpy as np
import pandas as pd
import os
import seaborn as sns
import joblib



# Ensure the Jupyter Notebooks fills the web browser
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
# Importing the dataset with pandas

# Changing the working directory to the raw data directory
os.chdir('/home/george/Documents/Insight_DS_TO20A/Projects/EmotionalDetection/data/raw/US_Airline_Sentiment')
data_temp = pd.read_csv('Tweets.csv')

In [3]:
data_temp.columns, data_temp.shape

(Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
        'negativereason', 'negativereason_confidence', 'airline',
        'airline_sentiment_gold', 'name', 'negativereason_gold',
        'retweet_count', 'text', 'tweet_coord', 'tweet_created',
        'tweet_location', 'user_timezone'],
       dtype='object'), (14640, 15))

In [109]:
# Minimalist appraoch to labelling the above data set: 
# Create a binary classification scheme = the headline has a "+" or "-" emotional response.
## Only collect the responses that gave a positive or negative emotional response from the airline

# data_pn = data_temp[data_temp.airline_sentiment != 'neutral'] 
# data_pn.airline_sentiment.unique(), data_pn.shape  

# Droping labels with nan and converting them
#data_temp.negativereason_confidence.dropna(how='all',inplace=True)
#data_temp['negativereason_confidence'] = data_temp.negativereason_confidence.round(0).astype(int)
#data_temp.negativereason_confidence.dropna(how='all',inplace=True)
#data_temp.negativereason_confidence.unique()


# # Gathering the text and labels

text = data_temp.text.copy(deep=True)
labels_confvalues = data_temp.negativereason_confidence
labels_confvalues.unique()
# # Pre-processing labels. Removing "inf" and nan
labels_confvalues.replace([np.inf, -np.inf], np.nan)
labels_confvalues.dropna()

# # Seperating the labels by seperating the confidence labels to be:
# # negative > 0.5, positive is <= 0.5
labels_confvalues.dropna(how='all', inplace=True)
labels = 1-labels_confvalues.round(0).astype(int)
filter_text = labels.index
text = text[filter_text]

# Printing the number of each class:
print("The total number of tweets are:", labels.shape[0])
print("The number (out of 5000) of negative emotional headlines =", labels.shape[0] - labels.sum()) # labeled with 0s
print("The number (out of 5000) of positive emotional headlines =", labels.sum()) # Labelled with 1s
print("The niave classification (1 class fit to all) is =", 1-labels.sum()/len(labels) )

The total number of tweets are: 10522
The number (out of 5000) of negative emotional headlines = 7397
The number (out of 5000) of positive emotional headlines = 3125
The niave classification (1 class fit to all) is = 0.7030032313248431


In [99]:
# Forcing a balance dataset -- NIAVELY REMOVING POSITIVE TWEETS To BALANCE THE DATASET

## Splitting the Data into a Training, Validation, and Test Set
## Fractions are: Training = 80%, Validation = 10%, Test = 10 %

from sklearn.model_selection import train_test_split  


## For the time being, the following code forces the training, validation, and test data sets are balanced
num_pos_labels = labels.sum() # 1418 headlines
num_neg_labels = labels.shape[0] - num_pos_labels # 3582 headlines
num_pos_labels_2drop = num_pos_labels - num_neg_labels # 2164 headlines
print(num_pos_labels)
print(num_neg_labels)
print(num_pos_labels_2drop)
# cnt = 0

index_pos = labels[labels == 1].index
index_neg = labels[labels == 0].index
labels_temp_bal_pos = labels[index_pos[num_pos_labels_2drop:]]
labels_temp_bal_neg = labels[index_neg]

labels_temp_bal = labels_temp_bal_neg.combine(labels_temp_bal_pos, max, fill_value=0)

print("The total length of the label series = ", labels_temp_bal.shape)
print("The total number of positive labels =",labels_temp_bal.sum())
labels_temp_bal.loc[14638], labels.loc[14638]

## Ensuring we use the appropriate text from the balanced labeled series
labels_temp_bal_indices = labels_temp_bal.index
text_bal = text[labels_temp_bal_indices]

# cnt = 0
# for i in range(labels_temp_bal.shape[0]):
#     if labels_temp_bal.iloc[i] == 1 and cnt < (num_pos_labels_2drop+1):
#         labels_temp_bal[i].drop(inplace=True)
#         cnt += 1

# index_Label0 = data_temp_bal.index[data_temp_bal['BinaryEmoLabel'] == 0].tolist()


# Labels = data_temp.BinaryEmoLabel
# X_train, X_test, y_train, y_test = train_test_split(data_temp, Labels, test_size=0.2, random_state=1)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.5, random_state=1)

7397
3125
4272
The total length of the label series =  (6250,)
The total number of positive labels = 3125


## Pre-processing the text / headlines
#### Initially the steps which follow were inspired by the Medium Blog Posts:
#### https://towardsdatascience.com/sentiment-analysis-with-python-part-1-5ce197074184
#### https://towardsdatascience.com/sentiment-analysis-with-python-part-2-4f71e7bde59a
#### Written By: Aaron Kub

#### https://towardsdatascience.com/twitter-sentiment-analysis-classification-using-nltk-python-fa912578614c
#### Written By: Mohamed Afham ** Follwoing this scheme to begin the process

In [38]:
#Data Analysis
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Data Preprocessing and Feature Engineering
from textblob import TextBlob
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

#Model Selection and Validation
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report,accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline, make_pipeline
sm = SMOTE(random_state=42)

In [22]:
## Combining all the above text pre-processing into one function 

def text_processing(headline):
    
    #Generating the list of words in the headline (hastags and other punctuations removed)
    def form_sentence(headline):
        headline_blob = TextBlob(headline)
        return ' '.join(headline_blob.words)
    new_headline = form_sentence(headline)
    
    #Removing stopwords and words with unusual symbols
    def no_user_alpha(headline):
        headline_list = [ele for ele in headline.split() if ele != 'user']
        clean_tokens = [t for t in headline_list if re.match(r'[^\W\d]*$', t)]
        clean_s = ' '.join(clean_tokens)
        clean_mess = [word for word in clean_s.split() if word.lower() not in stopwords.words('english')]
        return clean_mess
    no_punc_headline = no_user_alpha(new_headline)
    
    #Normalizing the words in headlines 
    def normalization(headline_list):
        lem = WordNetLemmatizer()
        normalized_headline = []
        for word in headline_list:
            normalized_text = lem.lemmatize(word,'v')
            normalized_text = lem.lemmatize(normalized_text,'a')
            normalized_text = lem.lemmatize(normalized_text,'n')
            normalized_headline.append(normalized_text)
        return normalized_headline
    
    
    return normalization(no_punc_headline)

In [23]:
def Int2EmotionConverter(IntEmo):
    """
    Converts the integer value of the emotion predicted to the emotions word in English
    """
    EmoWords = pd.DataFrame(['Happy','Sad'])
    return EmoWords.iloc[IntEmo][0]

In [96]:
# scikit-learn: Pre-defining a workflow of algorithm (Niave-Bayse Classifier)
pipeline = Pipeline([
    ('bow',CountVectorizer(analyzer=text_processing)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),                      # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])



In [103]:
pipeline_wSMOTE = make_pipeline( CountVectorizer(analyzer=text_processing),
                          TfidfTransformer(),
                          SMOTE(random_state=4) ,
                          MultinomialNB())

In [100]:
##### Training Model #### -- FOR THE MANUALLY BALANCED DATA SET
msg_train, msg_test, label_train, label_test = train_test_split(text_bal, labels_temp_bal, test_size=0.1, random_state=4)#, stratify=labels_temp_bal)
pipeline.fit(msg_train,label_train)
predictions = pipeline.predict(msg_test)
print(classification_report(predictions,label_test))
print(confusion_matrix(predictions,label_test))
print(accuracy_score(predictions,label_test))
# What would the accuracy of the Training Data Set is we niavely set all labels to 0
print("The accuracy of labelling all headlines NEGATIVE is:",1-label_test.sum()/len(label_test))

print("This model's accuracy is better than the Niave assumption by:", accuracy_score(predictions,label_test) - 1+label_test.sum()/len(label_test)) 

              precision    recall  f1-score   support

           0       0.72      0.83      0.77       277
           1       0.84      0.75      0.79       348

    accuracy                           0.78       625
   macro avg       0.78      0.79      0.78       625
weighted avg       0.79      0.78      0.78       625

[[229  48]
 [ 88 260]]
0.7824
The accuracy of labelling all headlines NEGATIVE is: 0.5072
This model's accuracy is better than the Niave assumption by: 0.2752


In [104]:
##### Training Model #### -- FOR THE SMOTE PIPLINE -- ALL DATA!
msg_train, msg_test, label_train, label_test = train_test_split(text, labels, test_size=0.1, random_state=4)# stratify=labels)
print('Size of Training Set =',msg_train.shape[0])
print('Fraction of Neg Train Labels = ', label_train.sum()*1.0 / len(label_train) )
print('Expected Number of Total Training Tweets after SMOTE is applied =', label_train.sum()*2)
pipeline_wSMOTE.fit(msg_train,label_train)
predictions = pipeline.predict(msg_test)
print(classification_report(predictions,label_test))
print(confusion_matrix(predictions,label_test))
print(accuracy_score(predictions,label_test))
# What would the accuracy of the Training Data Set is we niavely set all labels to 0
print("The accuracy of labelling all headlines NEGATIVE is:", max(1-label_test.sum()/len(label_test),label_test.sum()/len(label_test) ))

print("This model's accuracy is better than the Niave assumption by:", accuracy_score(predictions,label_test) - 1+label_test.sum()/len(label_test)) 

Size of Training Set = 9469
Fraction of Neg Train Labels =  0.7047206674411237
Expected Number of Total Training Tweets after SMOTE is applied = 13346
              precision    recall  f1-score   support

           0       0.87      0.47      0.61       601
           1       0.56      0.90      0.69       452

    accuracy                           0.66      1053
   macro avg       0.71      0.69      0.65      1053
weighted avg       0.74      0.66      0.65      1053

[[285 316]
 [ 44 408]]
0.6581196581196581
The accuracy of labelling all headlines NEGATIVE is: 0.6875593542260209
This model's accuracy is better than the Niave assumption by: 0.345679012345679


In [79]:
label_train.sum()*1.0

6673

In [None]:
sum(labels_temp_bal)/len(labels_temp_bal)

In [None]:
print(os.getcwd())

In [None]:
# saving model
import joblib

filename = 'NBC_USAirlines_model_Acc82p40.sav'
joblib.dump(pipeline, filename)

In [None]:
# test loading the model trained
filename = 'NBC_USAirlines_model_Acc83p36.sav'
loaded_pipeline = joblib.load(filename)

loaded_predictions = loaded_pipeline.predict(msg_test)
print(classification_report(loaded_predictions,label_test))
print(confusion_matrix(loaded_predictions,label_test))
print(accuracy_score(loaded_predictions,label_test))
# What would the accuracy of the Training Data Set is we niavely set all labels to 0
print("The accuracy of labelling all headlines NEGATIVE is:",1-label_train.sum()/len(label_train))

print("This model's accuracy is better than the Niave assumption by:", accuracy_score(loaded_predictions,label_test) - 1+label_train.sum()/len(label_train)) 


In [None]:
dummy_index = 90

temp_text = pd.Series([])
temp_text[0] = text.iloc[dummy_index]
type(temp_text)

out_temp = pipeline.predict(temp_text)
print("The sentence input was:\n",temp_text[0])
print("The emotional prediction was:", Int2EmotionConverter(out_temp[0]))

In [None]:
## Randomly print a positive review and a negative review -- MANUAL INSPECTION OF RESULTS

# Collecting all preditions
all_preds = pipeline.predict(text)

In [None]:
# Gathering all indices for "positive" and "negative" reviews
pred_neg_indices = np.where(all_preds == 0)[0] # all_preds[all_preds == 0].index 
pred_pos_indices = np.where(all_preds == 1)[0] # all_preds[all_preds == 1].index

neg_index_ran = np.random.choice(pred_neg_indices,1)[0]
pos_index_ran = np.random.choice(pred_pos_indices,1)[0]
#print(neg_index_ran)
#print(pos_index_ran)
print("THIS IS THE SAD CASE!!")
# Printing the results of the negative text
print("The sentence input was:\n",text.iloc[neg_index_ran])
print("The emotional prediction was:", Int2EmotionConverter(all_preds[neg_index_ran]))
print("________________________")
print("THIS IS THE HAPPY CASE!!")
# Printing the results of the positive text
print("The sentence input was:\n",text.iloc[pos_index_ran])
print("The emotional prediction was:", Int2EmotionConverter(all_preds[pos_index_ran]))

## Now predicting the emotional sentiment (neg., pos.) given a user string

In [None]:
temp_text = pd.Series([])
temp_text[0] = "I'm Sad!"

out_temp = pipeline.predict(temp_text)
print("The sentence input was:\n",temp_text[0])
print("The emotional prediction was:", Int2EmotionConverter(out_temp[0]))

In [None]:
temp_text
out_temp[0]
Int2EmotionConverter(out_temp[0])

## REAL CONVERSATION EXAMPLE CASE ##

In [None]:
# importing the real conversation
os.chdir('/home/george/Documents/Insight_DS_TO20A/Projects/EmotionalDetection/data/raw')
real_conv = pd.read_csv('real_chat.txt',sep="\n", header=None, dtype=str) 
real_conv = real_conv[0] # converts the DataFrame to a DataSeries 
                         # as desired for the label predictions


# Generating the emotional predictions from the above text
emos_real_chat = pipeline.predict(real_conv)

# generating the speaker/writer
speaker = pd.Series([])
for i in range(real_conv.shape[0]):
    speaker[i] = real_conv.iloc[i].split()[0][:-1] 

speaker.unique()

In [None]:
type(speaker), type(emos_real_chat)
chat_df = pd.DataFrame([])
chat_df['speaker'] = speaker
chat_df['emotions'] = emos_real_chat
chat_df.to_csv('chat_df.csv')

In [None]:
chat_df = pd.read_csv('chat_df.csv')#, chat_df.drop(columns=["Unnamed: 0"])

In [None]:
chat_df

In [None]:
#sns.palplot(sns.color_palette("husl", 8)) # 
index_agent = chat_df.index[chat_df['speaker'] == 'CS_Agent' ].tolist()
index_customer = chat_df.index[chat_df['speaker'] == 'Customer' ].tolist()

# fig, ax = plt.subplots()
plt.figure(figsize=(18, 16), dpi= 80, facecolor='w', edgecolor='k')
plt.plot(index_agent,chat_df['emotions'][index_agent],label='agent',lw=7,color='lightgreen')
plt.plot(index_customer,chat_df['emotions'][index_customer],label='customer', lw=7,color='lightblue')

#plt.axvline(x=37,linewidth=10,color='r',label="Intervention")

plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.,prop={'size': 30} )
plt.xlabel('Tweet Number', size=40)
plt.ylabel('Emotion',size=40)
plt.xticks(fontsize=25)
plt.yticks([0,1],labels=['Positive','Negative'],fontsize=40)

#plt.text(10.1,0,'Intervention',rotation=90)
# Changing the y-labels to "Sad" (0) and "Happy" (1)
# labels = [item.get_text() for item in ax.get_yticklabels()]
# labels[0] = 'Sad'
# labels[1] = 'Happy'
plt.xlim(0,10)
plt.savefig("Preliminary_AgentCustomer_Emotional_Output.png", bbox_inches='tight', dpi=100)
plt.plot()
#final_df.plot(x='index',y='emotions')


In [None]:
Customer_Chat = chat_df[chat_df["speaker"] == 'Customer']
CSAgent_Chat  = chat_df[chat_df["speaker"] == 'CS_Agent']
len(Customer_Chat) , len(CSAgent_Chat)

In [None]:
## Just animated 1 plot by examplining the two-graph animation code below
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation

fig = plt.figure()
ax1 = fig.add_subplot(2, 1, 1)
# ax2 = fig.add_subplot(2, 1, 2)

t = np.linspace(1, len(Customer_Chat), len(Customer_Chat))
x = np.array(Customer_Chat["emotions"])
#y = CSAgent_Chat["emotions"]

ax1.set_ylabel('Customer')
ax1.set_xlim(0, len(Customer_Chat))
ax1.set_ylim(-0.05, 1.05)
plt.setp(ax1.get_xticklabels(),visible=True)
plt.yticks([0,1],labels=['Positive','Negative'],fontsize=10)
# ax2.set_xlabel('t')
# ax2.set_ylabel(u'CS_Agent')
# ax2.set_xlim(0, max(chat_df.index))
# ax2.set_ylim(0, 1)

lines = []
for i in range(len(t)):    
    head = i - 1
    head_slice = (t > t[i] - 1.0) & (t < t[i])
    line1,  = ax1.plot(t[:i], x[:i], color='black')
    line1a, = ax1.plot(t[head_slice], x[head_slice], color='red', linewidth=2)
    line1e, = ax1.plot(t[head], x[head], color='red', marker='o', markeredgecolor='r')
    lines.append([line1,line1a,line1e])


# Build the animation using ArtistAnimation function

ani = animation.ArtistAnimation(fig,lines,interval=100,blit=True)

In [None]:
np.linspace(1, len(Customer_Chat), len(Customer_Chat))

In [None]:
## Now creating a new customer and agent conversation arrays which have the same (full conversation) index length

CSChat_Emotions = pd.DataFrame([])
CSChat_Emotions["Customer_Emotions"] = np.zeros([chat_df.shape[0]])
CSChat_Emotions["Agent_Emotions"] = np.zeros([chat_df.shape[0]])
Customer_Previous_Emotion = 0 # We presume both parties start in a positive state
CS_Agent_Previous_Emotion = 0 # ""
for i in range(chat_df.shape[0]):
    current_speaker = chat_df['speaker'].iloc[i]
    print(current_speaker)
    if current_speaker == "Customer":
        CSChat_Emotions["Customer_Emotions"].iloc[i] = chat_df['emotions'].iloc[i]
        CSChat_Emotions["Agent_Emotions"].iloc[i] = CS_Agent_Previous_Emotion
        Customer_Previous_Emotion = chat_df['emotions'].iloc[i]
    #else:
    elif current_speaker == "CS_Agent":
        CSChat_Emotions["Customer_Emotions"].iloc[i] = Customer_Previous_Emotion
        CSChat_Emotions["Agent_Emotions"].iloc[i] = chat_df['emotions'].iloc[i]
        CS_Agent_Previous_Emotion = chat_df['emotions'].iloc[i]
        


In [None]:
t = np.linspace(0, max(CSChat_Emotions.index), max(CSChat_Emotions.index)+1)
t, t.shape, CSChat_Emotions.shape

In [None]:
!brew install imagemagick

In [None]:
## Plotting both the Customer and the Agent in a single animated plot
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
#sns.palplot(sns.color_palette("husl", 8)) # 

fig = plt.figure()
ax1 = fig.add_subplot(2, 1, 1)
ax2 = fig.add_subplot(2, 1, 2)

t = np.linspace(1, len(CSChat_Emotions), len(CSChat_Emotions))
x = CSChat_Emotions["Customer_Emotions"]
y = CSChat_Emotions["Agent_Emotions"]
# print(t.shape)
# print(x.shape)
# print(y.shape)

ax1.set_ylabel(u'Customer')
ax1.yaxis.set_label_position("right")
ax1.set_xlim(0, max(CSChat_Emotions.index))
ax1.set_ylim(-0.05, 1.05)
plt.setp(ax1.get_xticklabels(),visible=False)
ax1.set_yticks([0, 1])
ax1.set_yticklabels(labels=['Postive', 'Negative'])
ax1.set_title('Real-time Emotions: Customer Service Text Conversation', size=15)

ax2.set_xlabel('Text Message Number')
ax2.set_ylabel(u'CS Agent')
ax2.yaxis.set_label_position("right")
ax2.set_xlim(0, max(CSChat_Emotions.index))
ax2.set_ylim(-0.05, 1.05)
ax2.set_yticks([0, 1])
ax2.set_yticklabels(labels=['Postive', 'Negative'])

lines = []
for i in range(1,len(t)): 
#     print(i)
    head = i - 1
    head_slice = (t > t[i] - 1.0) & (t < t[i])
    line1,  = ax1.plot(t[:i], x[:i], color='lightblue')
    line1a, = ax1.plot(t[head_slice], x[head_slice], color='red', linewidth=2)
    line1e, = ax1.plot(t[head], x[head], color='red', marker='o', markeredgecolor='r')
    line2,  = ax2.plot(t[:i], y[:i], color='lightgreen')
    line2a, = ax2.plot(t[head_slice], y[head_slice], color='red', linewidth=2)
    line2e, = ax2.plot(t[head], y[head], color='red', marker='o', markeredgecolor='r')
    lines.append([line1,line1a,line1e,line2,line2a,line2e])


# Build the animation using ArtistAnimation function

ani = animation.ArtistAnimation(fig,lines,interval=125,blit=True)
ani.save('animation.gif', writer='imagemagick', fps=10)

In [None]:
os.getcwd()

In [None]:
#max(CSChat_Emotions.index)
# line1a, = ax1.plot(t[head_slice], x[head_slice], color='red', linewidth=2)
# line1e, = ax1.plot(t[head], x[head], color='red', marker='o', markeredgecolor='r')
# head, t
t = np.linspace(1, len(CSChat_Emotions), len(CSChat_Emotions))
print(t)
for i in t:
    print(i)

# Customer Support on Twitter

In [None]:
## Importing the "Customer Support on Twitter"
os.chdir('/home/george/Documents/Insight_DS_TO20A/Projects/EmotionalDetection/data/raw/CustomerSupportTwitter')
cst_orig = pd.read_csv('twcs.csv')
print(cst_orig.shape)
cst_orig.dropna(inplace=True) # removing the NaN valued rows as we desire a conversation (3 tweets)
                              # which requires all tweet_id place holders to be non-empty
print(cst_orig.shape)

In [None]:
cst_orig.columns, cst_orig.shape

In [None]:
# Now parsing the response_tweet_id and in_reponse_to_tweet_idt
#print(type(cst_orig['response_tweet_id']))
#print(np.fromstring(cst_orig['response_tweet_id'][5], dtype=int, sep=','))

cst_orig_np = cst_orig['response_tweet_id'].to_numpy()

# type(str.split(cst_orig_np[5],','))
# int(str.split(cst_orig_np[5],',')[0])

# list(map(int, cst_orig_np))
print((cst_orig['response_tweet_id'].iloc[5]))
# list(map(int,cst_orig_np[5]))

In [None]:
## identifying conversations -- gathering the unqiue pairs of tweets
tweetID_pairs = pd.DataFrame([])
tweetID_pairs['author_id'] = cst_orig['author_id']
tweetID_pairs['response_tweet_id'] = cst_orig['response_tweet_id']
print("The length of original tweet id-pair dataframe is:", tweetID_pairs.shape[0])

# Generate a numpy array to sort the rows
tweetID_pairs_np = tweetID_pairs.to_numpy(copy=True)
tweetID_pairs_np = np.sort(tweetID_pairs_np)

tweetID_pairsSrtd = pd.DataFrame([])
tweetID_pairsSrtd['ID1'] = tweetID_pairs_np[:,0]
tweetID_pairsSrtd['ID2'] = tweetID_pairs_np[:,1]
tweetID_pairsSrtd.drop_duplicates(inplace=True)
print("The length of the unique tweet pair", tweetID_pairsSrtd.shape[0])

In [None]:
tweetID_pairsSrtd['ID1'].unique().shape, tweetID_pairsSrtd['ID2'].unique().shape

In [None]:
## Connecting twiter conversations together through 'tweet_id', 'response_tweet_id', 'in_response_to_tweet_id'
convos = pd.DataFrame([])

for i in range(cst_orig.shape[0]):
    convos["BaseID"].iloc[i] = cst_orig['tweet_id']
    response_temp = cst_orig['response_tweet_id']
    resp_list = 

In [None]:
## Plotting text to an image

from PIL import Image, ImageDraw

lines = ["In the old #BILGETANK we'll keep you in the know",
         "In the old #BILGETANK we'll fix your techie woes",
         "And we'll make things",
         "And we'll break things",
         "'til we're altogether aching",
         "Then we'll grab a cup of grog down in the old #BILGETANK"]

i_bad = 3 # This will be output by the model -- Negative Index

img = Image.new('RGB', (1024, 1024), color = (240, 255, 240))
font = ImageFont.truetype('/usr/share/fonts/truetype/freefont/FreeSerif.ttf', 32) 
d = ImageDraw.Draw(img)
for i in range(len(lines)):
    message = lines[i]
    if i != i_bad:
        d.text((10,10+36*i), message, fill=(100,20,255), font=font)
    else:
        d.text((10,10+36*i),"*"+message+"*", fill=(255,20,20), font=font)
file_name = "Text_%i" %(i)
print(file_name)
img.save('pil_text.png')

In [None]:
os.getcwd()

## -----------------------The Test Dev. Case is Completed ---------------------------------

# ------------------------------------------------------------------------------------

# APPENDIX: The Individual Functions and Examples cases to showcase how they work


In [None]:
## Punctuation Removal
def form_sentence(headline):
    headline_blob = TextBlob(headline)
    return ' '.join(headline_blob.words)

data_temp['HL_PuncRem_1'] = 'NaN'
for i in range(data_temp.shape[0]):
    data_temp.HL_PuncRem_1.iloc[i] = form_sentence(data_temp['headline'].iloc[i])
    
    

In [None]:
## Removing Stop-Words (e.g: is, are, have)
def no_user_alpha(headline):
    headline_list = [ele for ele in headline.split() if ele != 'user']
    clean_tokens = [t for t in headline_list if re.match(r'[^\W\d]*$', t)]
    clean_s = ' '.join(clean_tokens)
    clean_mess = [word for word in clean_s.split() if word.lower() not in stopwords.words('english')]
    return clean_mess

data_temp['HL_StopWords_2'] = 'NaN'
for i in range(data_temp.shape[0]):
    data_temp.HL_StopWords_2.iloc[i] = no_user_alpha(data_temp['HL_PuncRem_1'].iloc[i])


In [None]:
print(data_temp.HL_StopWords_2.iloc[0])
print(data_temp.headline.iloc[0])

In [None]:
# Normalize Text -- NLTK’s built-in WordNetLemmatizer does this 
def normalization(headline_list):
        lem = WordNetLemmatizer()
        normalized_headline = []
        for word in headline_list:
            normalized_text = lem.lemmatize(word,'v')
            normalized_headline.append(normalized_text)
        return normalized_headline
    
data_temp['HL_Normalize_3'] = 'NaN'
for i in range(data_temp.shape[0]):
    data_temp.HL_Normalize_3.iloc[i] = normalization(data_temp['HL_StopWords_2'].iloc[i])
    

In [None]:
print(data_temp.HL_Normalize_3.iloc[0])
print(data_temp.headline.iloc[0])

In [None]:
fig = plt.figure()
ax1 = fig.add_subplot(2, 1, 1)
ax2 = fig.add_subplot(2, 1, 2)

t = np.linspace(0, 10, 500)
x = np.cos(2 * np.pi * t)
y = np.sin(2 * np.pi * t)


ax1.set_ylabel(u'cos(2\u03c0t)')
ax1.set_xlim(0, 10)
ax1.set_ylim(-1, 1)
plt.setp(ax1.get_xticklabels(),visible=False)

ax2.set_xlabel('t')
ax2.set_ylabel(u'sin(2\u03c0t)')
ax2.set_xlim(0, 10)
ax2.set_ylim(-1, 1)

lines = []
for i in range(len(t)):
    head = i - 1
    head_slice = (t > t[i] - 1.0) & (t < t[i])
    line1,  = ax1.plot(t[:i], x[:i], color='black')
    line1a, = ax1.plot(t[head_slice], x[head_slice], color='red', linewidth=2)
    line1e, = ax1.plot(t[head], x[head], color='red', marker='o', markeredgecolor='r')
    line2,  = ax2.plot(t[:i], y[:i], color='black')
    line2a, = ax2.plot(t[head_slice], y[head_slice], color='red', linewidth=2)
    line2e, = ax2.plot(t[head], y[head], color='red', marker='o', markeredgecolor='r')
    lines.append([line1,line1a,line1e,line2,line2a,line2e])


# Build the animation using ArtistAnimation function

ani = animation.ArtistAnimation(fig,lines,interval=50,blit=True)