# loading the dataset

In [2]:
import pandas as pd

In [3]:
def remove_airline_nameTag_and_links(tweets):
    cleaned_tweets = []
    for tweet in tweets:
        tweet_words = tweet.split()
        
        for word in tweet_words:
            if(word.startswith('@')):
                tweet_words.remove(word)
            if(word.startswith('http')):
                tweet_words.remove(word)
                
        tweet_sent = ' '.join(word for word in tweet_words)
        cleaned_tweets.append(tweet_sent)
    
    return cleaned_tweets        

In [4]:
import emoji
def remove_emoji(tweets):
    cleaned_tweet = []
    
    for tweet in tweets:
        tweet_sent = "".join(char for char in tweet if((char not in emoji.UNICODE_EMOJI) and (char < '0' or char > '9')))
        cleaned_tweet.append(tweet_sent)
    
    return cleaned_tweet

In [5]:
import numpy as np
df1 = pd.read_excel('emoji_sentiment_data.xlsx')
emoji_sentiment = {}
index = 0
for unicode in df1['Unicode codepoint']:
    
    sentiment = np.array([df1['Negative'].iloc[index],
                              df1['Neutral'].iloc[index], 
                              df1['Positive'].iloc[index]])
    max_senti_pos = sentiment.argmax()
    if(max_senti_pos == 0):
        emoji_sentiment[unicode] = 'sad'
    elif(max_senti_pos == 1):
        emoji_sentiment[unicode] = 'neutral'
    elif(max_senti_pos == 2):
        emoji_sentiment[unicode] = 'happy'
    
    index+=1

In [6]:
import numpy as np
def replace_emoji(tweets):
    cleaned_tweet = []
    replaced_sentiment = []
    
    for tweet in tweets:
        tweet_sent = " ".join(emoji_sentiment[hex(ord(char))] for char in tweet if(char in emoji.UNICODE_EMOJI and (hex(ord(char)) in emoji_sentiment)))
        replaced_sentiment.append(tweet_sent)
    
    remove_emoji_tweets = remove_emoji(tweets)
    
    total_tweets = len(remove_emoji_tweets)
    
    for tweet_no in range(total_tweets):
        cleaned_tweet.append(remove_emoji_tweets[tweet_no] + replaced_sentiment[tweet_no])
    
    
    return cleaned_tweet

In [7]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

def simple_posTag(tag):
    if(tag.startswith('J')):
        return wordnet.ADJ
    elif(tag.startswith('V')):
        return wordnet.VERB
    elif(tag.startswith('N')):
        return wordnet.NOUN
    elif(tag.startswith('R')):
        return wordnet.ADV
    else:
        return wordnet.NOUN
        

def remove_stopwords_and_lemmatization_with_posTag(tweets):
    stopWords = stopwords.words('english') + list(string.punctuation)

    tweets_lemmatized = []
    
    for tweet in tweets:
        tweet_words = word_tokenize(tweet.lower())

        lemmatizer = WordNetLemmatizer()
        lemmatize_tweet = [lemmatizer.lemmatize(word,pos=simple_posTag(pos_tag(word)[0][1])) for word in tweet_words if(word not in stopWords)]
        tweets_lemmatized.append(" ".join(lemmatize_tweet))
    return tweets_lemmatized

In [8]:
def building_dataset_with_emogi(tweets, count_vec):
    tweets_without_nameTag_and_links = remove_airline_nameTag_and_links(tweets)

    tweets_emoji_replace = replace_emoji(tweets_without_nameTag_and_links)
    
    lemmatized_tweets_with_emoji = remove_stopwords_and_lemmatization_with_posTag(tweets_emoji_replace)
    
    dataset = count_vec.fit_transform(lemmatized_tweets_with_emoji).todense()
    
    features = count_vec.get_feature_names()
    
    return features, dataset

In [9]:
def building_dataset_without_emogi(tweets, count_vec):
    tweets_without_nameTag_and_links = remove_airline_nameTag_and_links(tweets)

    tweets_without_emoji = remove_emoji(tweets_without_nameTag_and_links)
    
    lemmatized_tweets_without_emoji = remove_stopwords_and_lemmatization_with_posTag(tweets_without_emoji)
    
    dataset = count_vec.fit_transform(lemmatized_tweets_without_emoji).todense()
    
    features = count_vec.get_feature_names()
    
    return features, dataset

In [10]:
import pandas as pd
df = pd.read_csv('training_twitter_x_y_train.csv')
tweets = df['text']
sentiments = df['airline_sentiment']


In [11]:
from sklearn.feature_extraction.text import CountVectorizer

count_vec1 = CountVectorizer(max_features = 3000, ngram_range = (1,3))

features_emoji, train_dataset_emoji = building_dataset_with_emogi(tweets, count_vec1)

features_without_emoji, train_dataset_without_emoji = building_dataset_without_emogi(tweets, count_vec1)

In [12]:
train_dataset_emoji.shape, train_dataset_without_emoji.shape

((10980, 3000), (10980, 3000))

In [13]:
from sklearn.model_selection import train_test_split

x_train_emoji, x_test_emoji, y_train_emoji, y_test_emoji = train_test_split(train_dataset_emoji, sentiments, random_state = 1234)

x_train_without_emoji, x_test_without_emoji, y_train_without_emoji, y_test_without_emoji = train_test_split(train_dataset_without_emoji, sentiments, random_state = 1234)

In [14]:
def get_n_components(data):
    pca = PCA()

    data_transform = pca.fit_transform(data)
    
    n_components = 0
    total_variance = sum(pca.explained_variance_)
    current_variance = 0
    while(current_variance/total_variance <= 0.99):
        current_variance += pca.explained_variance_[n_components]
        n_components += 1
    
    
    return n_components

In [15]:
from sklearn.decomposition import PCA

pca_emoji = PCA(n_components = get_n_components(x_train_emoji), whiten = True)

x_train_emoji_pca = pca_emoji.fit_transform(x_train_emoji)

x_test_emoji_pca = pca_emoji.transform(x_test_emoji)

In [18]:
from sklearn.svm import SVC
#from sklearn.model_selection import GridSearchCV

svm_clf = SVC()
#grid = {"C" : [10**i for i in range(4)], "gamma" : [10**i for i in range(-2,3)]}
#best_svc = GridSearchCV(svm_clf, grid)
#best_svc.fit(x_train_emoji,sentiments)
#best_svc.best_estimator_
svm_clf.fit(x_train_emoji_pca, y_train_emoji)
svm_clf.score(x_train_emoji_pca, y_train_emoji)

0.9162112932604736

In [20]:
svm_clf.score(x_test_emoji_pca, y_test_emoji)

0.7114754098360656

In [16]:
from sklearn.decomposition import PCA

pca_without_emoji = PCA(n_components = get_n_components(x_train_without_emoji), whiten = True)

x_train_without_emoji_pca = pca_without_emoji.fit_transform(x_train_without_emoji)

x_test_without_emoji_pca = pca_without_emoji.transform(x_test_without_emoji)

In [21]:
svm_clf1 = SVC()
#grid = {"C" : [10**i for i in range(4)], "gamma" : [10**i for i in range(-2,3)]}
#best_svc = GridSearchCV(svm_clf, grid)
#best_svc.fit(x_train_emoji,sentiments)
#best_svc.best_estimator_
svm_clf1.fit(x_train_without_emoji_pca, y_train_without_emoji)
svm_clf1.score(x_train_without_emoji_pca, y_train_without_emoji)

0.9164541590771099

In [None]:
svm_clf1.score(x_test_without_emoji_pca, y_test_without_emoji)