In [None]:
import nltk
import numpy as np
import matplotlib as plt
import pandas as pd
from nltk.tokenize import TweetTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import re
import string
from nltk.corpus import twitter_samples

In [None]:
def process_tweets(tweet):
    stemmer=PorterStemmer()
    stop_words=stopwords.words('english')
#     next 2 lines removes noise
    tweet = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', tweet)
    tweet = re.sub("(@[A-Za-z0-9_]+)","", tweet)
    tokenizer = TweetTokenizer(preserve_case=True, strip_handles=True, reduce_len=True)
    tweet_tokens = tokenizer.tokenize(tweet)
    clean_tweets=[]
    for word in tweet_tokens:
        if word not in stop_words and word not in string.punctuation:
            stemmed_word = stemmer.stem(word)
            clean_tweets.append(stemmed_word)
    return clean_tweets
            
            
def build_frequency(tweets, ys):
    yslist=np.squeeze(ys).tolist()
    frequency = {}
    for y, tweet in zip(yslist, tweets):
        for word in process_tweets(tweet):
            pair=(word, y)
            if pair in frequency:
                frequency[pair] += 1
            else:
                frequency[pair] =1
    return frequency

In [None]:
positive_tweets = twitter_samples.strings('positive_tweets.json')
negative_tweets = twitter_samples.strings('negative_tweets.json')

# split tweets for training and testing.
positive_tests=positive_tweets[4000:]
positive_train=positive_tweets[:4000]
negative_tests=negative_tweets[4000:]
negative_train=negative_tweets[:4000]

train_x=positive_train + negative_train
test_x= positive_tests + negative_tests
train_y = np.append(np.ones((len(positive_train), 1)), np.zeros((len(negative_train), 1))
, axis=0)
test_y = np.append(np.ones((len(positive_tests),1)), np.zeros((len(negative_tests), 1)), axis=0)


In [None]:
frequency = build_frequency(train_x, train_y)
print(f'{len(frequency.keys())}')

print(train_x[1])
print(process_tweets(train_x[1]))

In [None]:
def sigmoid(z):
    h=1/(1+np.exp(-z))
    return h

def gradientDescent(x, y, theta, alpha, num_iters):
    m = x.shape[0]
    for i in range(0, num_iters):
        z = np.dot(x, theta)
        h= sigmoid(z)
        j= -1./m*(np.dot(y.transpose(), np.log(h)) + np.dot((1-y).transpose(), np.log(1-h)))
        
        theta = theta - (alpha/m)*np.dot(x.transpose(), (h-y))
        j = float(j)
        return j, theta
    
def extract_features(token, frequency):
    word_token = process_tweets(token)
    x = np.zeros((1,3))
    x[0,0]=1

    for word in word_token:
        x[0,1]+=frequency.get((word, 1.0),0)
        x[0,2] +=frequency.get((word, 0.0),0)
        assert(x.shape ==(1,3))
        return x

X = np.zeros((len(train_x),3))
for i in range(len(train_x)):
    X[i, :]=extract_features(train_x[i], frequency)
    
Y = train_y
J, theta = gradientDescent(X,Y, np.zeros((3,1)), 1e-9, 1500)
    

def predict_tweet(tweet, frequency, theta):
    x = extract_features(tweet, frequency)
    y_pred = sigmoid(np.dot(x, theta))
    return y_pred
    

In [None]:
my_tweet = 'I am learning :)'
predict_tweet(my_tweet, frequency, theta)

In [None]:
# custom tweet test

custom_tweet = 'best car'
tweet_prediction=predict_tweet(custom_tweet, frequency, theta)
if tweet_prediction > 0.5:
    print('Positive')
else:
    print('Negative')

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from sklearn import svm
from sklearn.metrics import classification_report

train_data_filepath="https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/train.csv"
test_data_filepath =  "https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/test.csv"

train_data = pd.read_csv(train_data_filepath)
test_data = pd.read_csv(test_data_filepath)

vectorizer = TfidfVectorizer(min_df = 5, max_df = 0.8, sublinear_tf = True, use_idf = True)

train_vectors = vectorizer.fit_transform(train_data['Content'])
test_vectors = vectorizer.transform(test_data['Content'])

classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, train_data['Label'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# print(f'training time: {time_linear_train}, prediction time: {time_linear_predict} ')
report  = classification_report(test_data['Label'], prediction_linear, output_dict=True)
print(report['pos'])
print(report['neg'])

In [None]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
import re, string
from nltk.corpus import stopwords
from nltk.corpus import twitter_samples
from nltk import classify
from nltk import NaiveBayesClassifier


In [None]:
positive_tweets  = twitter_samples.strings('positive_tweets.json')
negative_tweets  = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from sklearn import svm
from sklearn.metrics import classification_report

train_data_filepath="https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/train.csv"
test_data_filepath =  "https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/test.csv"

train_data = pd.read_csv(train_data_filepath)
test_data = pd.read_csv(test_data_filepath)

vectorizer = TfidfVectorizer(min_df = 5, max_df = 0.8, sublinear_tf = True, use_idf = True)

train_vectors = vectorizer.fit_transform(train_data['Content'])
test_vectors = vectorizer.transform(test_data['Content'])

classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, train_data['Label'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# print(f'training time: {time_linear_train}, prediction time: {time_linear_predict} ')
report  = classification_report(test_data['Label'], prediction_linear, output_dict=True)
print(report['pos'])
print(report['neg'])

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from sklearn import svm
from sklearn.metrics import classification_report

train_data_filepath="https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/train.csv"
test_data_filepath =  "https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/test.csv"

train_data = pd.read_csv(train_data_filepath)
test_data = pd.read_csv(test_data_filepath)

vectorizer = TfidfVectorizer(min_df = 5, max_df = 0.8, sublinear_tf = True, use_idf = True)

train_vectors = vectorizer.fit_transform(train_data['Content'])
test_vectors = vectorizer.transform(test_data['Content'])

classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, train_data['Label'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# print(f'training time: {time_linear_train}, prediction time: {time_linear_predict} ')
report  = classification_report(test_data['Label'], prediction_linear, output_dict=True)
print(report['pos'])
print(report['neg'])

In [None]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
import re, string
from nltk.corpus import stopwords
from nltk.corpus import twitter_samples
from nltk import classify
from nltk import NaiveBayesClassifier


In [None]:
positive_tweets  = twitter_samples.strings('positive_tweets.json')
negative_tweets  = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')


In [None]:
stop_words=stopwords.words('english')
def remove_noise(tweet_tokens, stop_words=()):
    cleaned_tweets =[]
    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)
        
        if tag.startswith('NN'):
            pos='n'
        elif tag.startswith('VB'):
            pos='v'
        else:
            pos='a'
        lemmatizer=WordNetLemmatizer()
        token = lemmatizer.lemmatize(token,pos)
        
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tweets.append(token.lower())
    return cleaned_tweets
    
    
print(remove_noise(tweet_tokens[2], stop_words))


In [None]:
print(positive_uncleaned_tweets[13])
print(positive_cleaned_tweets_list[13])

In [None]:
#         convert tokens into dictionary for both positiev and negatice tweets

def model_for_tweets(cleaned_tokens_list):
    for tweets in cleaned_tokens_list:
        yield dict([tweet, True] for tweet in tweets )
        
positive_tweets_model=model_for_tweets(positive_uncleaned_tweets)
negative_tweets_model=model_for_tweets(negative_uncleaned_tweets)


In [None]:
#  split the dataset for training and testing the models

import random
positive_dataset=[(tweet_dict, 'Positive') for tweet_dict in positive_tweets_model]
negative_dataset=[(tweet_dict, 'Negative') for tweet_dict in negative_tweets_model]

dataset=positive_dataset+negative_dataset
random.shuffle(dataset)

train_data=dataset[:7000]
test_data=dataset[7000:]


In [None]:
from nltk import classify
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is : ", classify.accuracy(classifier, test_data))
print(classifier.show_most_informative_features(10))

In [None]:
custom_tweet="""
During opening statements in Derek Chauvin's trial in the death of George Floyd, prosecutors showed jurors a bystander video of the former officer kneeling on Floyd's neck.
"""
custom_tokens = remove_noise(nltk.word_tokenize(custom_tweet))
print(classifier.classify(dict([token, True] for token in custom_tokens)))

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from sklearn import svm
from sklearn.metrics import classification_report

train_data_filepath="https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/train.csv"
test_data_filepath =  "https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/test.csv"

train_data = pd.read_csv(train_data_filepath)
test_data = pd.read_csv(test_data_filepath)

vectorizer = TfidfVectorizer(min_df = 5, max_df = 0.8, sublinear_tf = True, use_idf = True)

train_vectors = vectorizer.fit_transform(train_data['Content'])
test_vectors = vectorizer.transform(test_data['Content'])

classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, train_data['Label'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# print(f'training time: {time_linear_train}, prediction time: {time_linear_predict} ')
report  = classification_report(test_data['Label'], prediction_linear, output_dict=True)
print(report['pos'])
print(report['neg'])

In [None]:
#  add custom tweet here 
custom_tweet="""
    #add custom tweet here

"""
custom_vector = vectorizer.transform([custom_text])
print(classifier_linear.predict(custom_vector))


In [None]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
import re, string
from nltk.corpus import stopwords
from nltk.corpus import twitter_samples
from nltk import classify
from nltk import NaiveBayesClassifier


In [None]:
#  add custom tweet here 
custom_tweet="""
    #add custom tweet here

"""
custom_vector = vectorizer.transform([custom_text])
print(classifier_linear.predict(custom_vector))


In [None]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import twitter_samples
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
import re, string
from nltk.corpus import stopwords
from nltk.corpus import twitter_samples
from nltk import classify
from nltk import NaiveBayesClassifier


In [None]:
positive_tweets  = twitter_samples.strings('positive_tweets.json')
negative_tweets  = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json'

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import time
from sklearn import svm
from sklearn.metrics import classification_report

train_data_filepath="https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/train.csv"
test_data_filepath =  "https://raw.githubusercontent.com/Vasistareddy/sentiment_analysis/master/data/test.csv"

train_data = pd.read_csv(train_data_filepath)
test_data = pd.read_csv(test_data_filepath)

vectorizer = TfidfVectorizer(min_df = 5, max_df = 0.8, sublinear_tf = True, use_idf = True)

train_vectors = vectorizer.fit_transform(train_data['Content'])
test_vectors = vectorizer.transform(test_data['Content'])

classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, train_data['Label'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0
time_linear_predict = t2-t1

# print(f'training time: {time_linear_train}, prediction time: {time_linear_predict} ')
report  = classification_report(test_data['Label'], prediction_linear, output_dict=True)
print(report['pos'])
print(report['neg'])

In [None]:
#  add custom tweet here 
custom_tweet="""
    #add custom tweet here

"""
custom_vector = vectorizer.transform([custom_text])
print(classifier_linear.predict(custom_vector))

In [2]:
positive_tweets  = twitter_samples.strings('positive_tweets.json')
negative_tweets  = twitter_samples.strings('negative_tweets.json')
text = twitter_samples.strings('tweets.20150430-223406.json')

LookupError: 
**********************************************************************
  Resource [93mtwitter_samples[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('twitter_samples')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/twitter_samples[0m

  Searched in:
    - 'C:\\Users\\HP-PC/nltk_data'
    - 'c:\\users\\hp-pc\\appdata\\local\\programs\\python\\python39\\nltk_data'
    - 'c:\\users\\hp-pc\\appdata\\local\\programs\\python\\python39\\share\\nltk_data'
    - 'c:\\users\\hp-pc\\appdata\\local\\programs\\python\\python39\\lib\\nltk_data'
    - 'C:\\Users\\HP-PC\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [1]:
# tweet_tokens = twitter_samples.tokenized('positive_tweets.json')
# # tweet_tokens[2]

### step 1: Normalize the data

In [22]:

# print(pos_tag(tweet_tokens[1]))

def lemmatize_sentence(tokens):
    lemmatizer = WordNetLemmatizer()
    lemmatized_sentence = []
    
    for word, tag in pos_tag(tokens):
        if tag.startswith('NN'):
            pos ='n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'
        lemmatized_sentence.append(lemmatizer.lemmatize(word, pos))
    return lemmatized_sentence


print(lemmatize_sentence(tweet_tokens[2]))

['@DespiteOfficial', 'we', 'have', 'a', 'listen', 'last', 'night', ':)', 'As', 'You', 'Bleed', 'be', 'an', 'amazing', 'track', '.', 'When', 'be', 'you', 'in', 'Scotland', '?', '!']


### Step 2: Remove noise

In [39]:

stop_words=stopwords.words('english')
def remove_noise(tweet_tokens, stop_words=()):
    cleaned_tweets =[]
    for token, tag in pos_tag(tweet_tokens):
        token = re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+#]|[!*\(\),]|'\
                       '(?:%[0-9a-fA-F][0-9a-fA-F]))+','', token)
        token = re.sub("(@[A-Za-z0-9_]+)","", token)
        
        if tag.startswith('NN'):
            pos='n'
        elif tag.startswith('VB'):
            pos='v'
        else:
            pos='a'
        lemmatizer=WordNetLemmatizer()
        token = lemmatizer.lemmatize(token,pos)
        
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tweets.append(token.lower())
    return cleaned_tweets
    
    
print(remove_noise(tweet_tokens[2], stop_words))

['listen', 'last', 'night', ':)', 'bleed', 'amazing', 'track', 'scotland']


In [43]:


positive_uncleaned_tweets=twitter_samples.tokenized('positive_tweets.json')
positive_cleaned_tweets_list=[]
negative_uncleaned_tweets=twitter_samples.tokenized('negative_tweets.json')
negative_cleaned_tweets_list=[]

for words in positive_uncleaned_tweets:
    positive_cleaned_tweets_list.append(remove_noise(words, stop_words ))
    
for words in negative_uncleaned_tweets:
    negative_cleaned_tweets_list.append(remove_noise(words, stop_words ))


In [44]:
print(positive_uncleaned_tweets[13])
print(positive_cleaned_tweets_list[13])

['@jjulieredburn', 'Perfect', ',', 'so', 'you', 'already', 'know', "what's", 'waiting', 'for', 'you', ':)']
['perfect', 'already', 'know', "what's", 'wait', ':)']


### step 3: Prepare data model

In [61]:
#         convert tokens into dictionary for both positiev and negatice tweets

def model_for_tweets(cleaned_tokens_list):
    for tweets in cleaned_tokens_list:
        yield dict([tweet, True] for tweet in tweets )
        
positive_tweets_model=model_for_tweets(positive_uncleaned_tweets)
negative_tweets_model=model_for_tweets(negative_uncleaned_tweets)



In [60]:
#  split the dataset for training and testing the models

import random
positive_dataset=[(tweet_dict, 'Positive') for tweet_dict in positive_tweets_model]
negative_dataset=[(tweet_dict, 'Negative') for tweet_dict in negative_tweets_model]

dataset=positive_dataset+negative_dataset
random.shuffle(dataset)

train_data=dataset[:7000]
test_data=dataset[7000:]

### step 4: Building and testing the model

In [63]:
from nltk import classify
from nltk import NaiveBayesClassifier
classifier = NaiveBayesClassifier.train(train_data)

print("Accuracy is : ", classify.accuracy(classifier, test_data))
print(classifier.show_most_informative_features(10))

Accuracy is :  0.9956666666666667
Most Informative Features
                      :( = True           Negati : Positi =   2065.4 : 1.0
                      :) = True           Positi : Negati =    986.0 : 1.0
                     sad = True           Negati : Positi =     49.8 : 1.0
                   Thank = True           Positi : Negati =     26.3 : 1.0
                  THANKS = True           Negati : Positi =     24.4 : 1.0
                 welcome = True           Positi : Negati =     22.9 : 1.0
                  FOLLOW = True           Negati : Positi =     19.1 : 1.0
                    THAT = True           Negati : Positi =     19.1 : 1.0
                    miss = True           Negati : Positi =     18.5 : 1.0
                    MUCH = True           Negati : Positi =     18.4 : 1.0
None


In [1]:
custom_tweet="""
During opening statements in Derek Chauvin's trial in the death of George Floyd, prosecutors showed jurors a bystander video of the former officer kneeling on Floyd's neck.
"""
custom_tokens = remove_noise(nltk.word_tokenize(custom_tweet))
print(classifier.classify(dict([token, True] for token in custom_tokens)))

NameError: name 'remove_noise' is not defined