### Implement Logistic Classification for classifying tweets / text
Given a tweet, we have to decide whether it is positive or negative.

In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import twitter_samples

In [None]:
nltk.download('twitter_samples')

In [None]:
nltk.download('stopwords')

### Load and Analyse the dataset

In [None]:
# load positive tweets
positive_tweets = twitter_samples.strings('positive_tweets.json')
positive_tweets[:3]

In [None]:
# load negative tweets
negative_tweets = twitter_samples.strings('negative_tweets.json')
negative_tweets[:3]

In [None]:
## total number of pos and neg tweets

print(f"Total No. of Positive tweets: {len(positive_tweets)}")
print(f'Total No. of Negative tweets: {len(negative_tweets)}')

In [4]:
## Build train and test datasets with an equal mix of positive and negative tweets.
## The first TRAIN_PER_CLASS tweets of each class are used for training and the
## remainder for testing (8000 train / 2000 test, assuming 5000 tweets per class).

TRAIN_PER_CLASS = 4000  # number of tweets taken from each class for training

train_pos = positive_tweets[:TRAIN_PER_CLASS]
train_neg = negative_tweets[:TRAIN_PER_CLASS]

test_pos = positive_tweets[TRAIN_PER_CLASS:]
test_neg = negative_tweets[TRAIN_PER_CLASS:]

# combine them: positives first, then negatives (labels must use the same order)

train_data = train_pos + train_neg
test_data = test_pos + test_neg

print(f'Total number of data count train data: {len(train_data)} and test data : {len(test_data)}')

NameError: name 'positive_tweets' is not defined

In [None]:
# Labels as column vectors: 1.0 for each positive tweet followed by 0.0 for each
# negative tweet, stacked row-wise in the same order as the tweet lists.
train_label = np.vstack((
    np.ones((len(train_pos), 1)),
    np.zeros((len(train_neg), 1)),
))
test_label = np.vstack((
    np.ones((len(test_pos), 1)),
    np.zeros((len(test_neg), 1)),
))

print(f'Shape of Train and Test labels : {train_label.shape} and {test_label.shape}')

### Processing of the data to create word frequencies list

In [None]:
from nltk.corpus import stopwords
import re

_STOP_WORDS_CACHE = {}


def _english_stop_words():
    '''
        Return the NLTK English stop words as a frozenset, reading the corpus only once.
    '''
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean_tweet(tweet):
    '''
        Clean a tweet, tokenise it and drop English stop words.
        No stemming or punctuation removal is applied.
        @input: tweet - raw tweet text
        @output: list of tokens (tokenised with preserve_case=False and
                 handles stripped) that are not stop words
    '''
    stop_words = _english_stop_words()

    tweet = re.sub(r'#', '', tweet)  ## remove the # symbol but keep the hashtag word
    # Remove only the URL itself. The earlier pattern used '.*', which also
    # deleted every word that followed a link on the same line.
    tweet = re.sub(r'https?://\S+', '', tweet)
    tweet = re.sub(r'^RT[\s]+', '', tweet)  ## remove a leading Retweet marker (RT)

    tokenizer = nltk.tokenize.TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

    # set membership keeps the stop-word filter O(1) per token
    return [word for word in tokenizer.tokenize(tweet) if word not in stop_words]
    

def build_tweet_frequency(tweets, label):
    '''
        Build a vocab of tweet word frequencies across corpus.
        @input: Tweets - list of tweets
                label - Array of tweet sentiments, one per tweet
        @output: a dict of (word, label):frequency
    '''
    # squeeze() turns a single-label array into a 0-d array whose tolist() is a
    # bare scalar; atleast_1d keeps it a one-element list so zip() still works.
    label_list = np.atleast_1d(np.squeeze(label)).tolist()

    freq = {}

    for t, l in zip(tweets, label_list):
        for word in clean_tweet(t):
            word_pair = (word, l)
            freq[word_pair] = freq.get(word_pair, 0) + 1

    return freq
    

In [None]:
train_data[0] ## 0, 500

In [None]:
clean_tweet(train_data[0])

In [None]:
tweet_freq_vocab = build_tweet_frequency(train_data, train_label)

In [None]:
tweet_freq_vocab.get(('sad',0))

In [None]:
def extract_features(tweet, vocab):
    '''
        Turn a tweet into the feature row built from the frequency vocab.
        @input:
            tweet - tweet we want to extract features from
            vocab - frequency vocab dictionary keyed by (word, label)
        @output:
            a numpy array of shape (1, 3):
            [constant 1, summed positive freq, summed negative freq]
    '''
    tokens = clean_tweet(tweet)

    # Words missing from the vocab contribute 0; a word that occurs several
    # times in the tweet is counted once per occurrence.
    positive_total = sum(vocab.get((token, 1.0), 0) for token in tokens)
    negative_total = sum(vocab.get((token, 0.0), 0) for token in tokens)

    features = np.zeros((1, 3))
    features[0, 0] = 1  # first column is always 1
    features[0, 1] = positive_total
    features[0, 2] = negative_total

    return features

In [None]:
extract_features(train_data[0],tweet_freq_vocab)

In [None]:
extract_features('Hi How are you? I am doing good', tweet_freq_vocab)

### Model Training

In [None]:
## Generate the word-frequency feature matrix for all of the train and test tweets

def build_feature_matrix(tweets, vocab):
    '''
        Stack the extract_features row of every tweet into one matrix.
        @input:
            tweets - list of tweets
            vocab - frequency vocab dictionary
        @output:
            numpy array of shape (len(tweets), 3), one row per tweet
    '''
    features = np.zeros((len(tweets), 3))
    for row, tweet in enumerate(tweets):
        features[row, :] = extract_features(tweet, vocab)
    return features


train_X = build_feature_matrix(train_data, tweet_freq_vocab)
train_y = train_label

test_X = build_feature_matrix(test_data, tweet_freq_vocab)
test_y = test_label

In [None]:
train_X[0:5]

In [None]:
train_y.shape

In [None]:
from sklearn.linear_model import LogisticRegression

# fit() expects a 1-D target of shape (n_samples,). train_y is an (n, 1) column
# vector, so flatten it to avoid scikit-learn's DataConversionWarning.
model = LogisticRegression(solver='liblinear')
model.fit(train_X, train_y.ravel())

In [None]:
predictions = model.predict(test_X)

In [None]:
from sklearn.metrics import accuracy_score

accuracy_score(test_y, predictions)

In [None]:
from sklearn.metrics import classification_report

print(classification_report(test_y,predictions))

### Making your own predictions

In [None]:
my_tweet1 = 'i liked my prediction score. happy with the results'
model.predict(extract_features(my_tweet1,tweet_freq_vocab))

In [None]:
my_tweet2 = 'i am sad with the result of the football match'
model.predict(extract_features(my_tweet2,tweet_freq_vocab))

In [None]:
my_tweet3 = 'shame that i couldnt get an entry to the competition'
model.predict(extract_features(my_tweet3,tweet_freq_vocab))

In [None]:
my_tweet3 = 'this movie should have been great.'
model.predict(extract_features(my_tweet3,tweet_freq_vocab)) ## misclassified example

In [None]:
my_tweet3 = 'i liked my prediction score. not happy with the results'
model.predict(extract_features(my_tweet3,tweet_freq_vocab))

In [None]:
my_tweet4 = 'My boss is a true genius'
model.predict(extract_features(my_tweet4,tweet_freq_vocab))

In [None]:
my_tweet5 = 'I have the greatest boss in the world'
model.predict(extract_features(my_tweet5,tweet_freq_vocab))