## Twitter Sentiment Project

In [2]:
import pandas as pd
import numpy as np
import re

In [3]:
def import_tweets(filename, header = None):
    """Load a sentiment140-style CSV and keep only the label and the tweet text.

    Arguments:
        filename: path of the CSV file, read with ISO-8859-1 encoding.
        header:   forwarded to ``pandas.read_csv``; the default ``None`` means
                  the file has no header row.

    Returns:
        A DataFrame with two columns, ``sentiment`` and ``text``. Every
        sentiment value of 4 is rewritten to 1; other values are left as read.
    """
    frame = pd.read_csv(filename, encoding="ISO-8859-1", header=header)
    # Column names follow the sentiment140 dataset provided on Kaggle.
    frame.columns = ['sentiment', 'id', 'date', 'flag', 'user', 'text']
    # sentiment140 encodes positive as 4 and negative as 0; map positive to 1.
    frame['sentiment'] = frame['sentiment'].replace(4, 1)
    # Four columns are not needed for the analysis: flag, id, user and date.
    unused_columns = ['flag', 'id', 'user', 'date']
    return frame.drop(columns=unused_columns)

In [4]:
def preprocess_tweet(tweet):
    """Normalise the text of a single tweet.

    The steps run in this order:
      1. lower-case the whole tweet;
      2. replace every URL (``www.…`` or ``http(s)://…``) with the token "URL";
      3. replace every ``@username`` mention with the token "AT_USER";
      4. collapse each run of whitespace into a single space;
      5. turn ``#topic`` into ``topic``.

    Arguments:
        tweet: a single tweet as a string.

    Returns:
        The cleaned tweet as a new string. The placeholder tokens "URL" and
        "AT_USER" stay upper-case because they are inserted after step 1.
    """
    # str.lower() returns a new string (strings are immutable), so the result
    # must be assigned back; calling it as a bare statement has no effect.
    tweet = tweet.lower()
    # Raw strings keep regex escapes such as \. and \s from being treated as
    # (invalid) Python string escapes.
    # convert all urls to string "URL"
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    # convert all @username to "AT_USER"
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)
    # correct all multiple white spaces to a single white space
    tweet = re.sub(r'[\s]+', ' ', tweet)
    # convert "#topic" to just "topic"
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    return tweet

In [5]:
# Load the tweets CSV through import_tweets (header keeps its default of None).
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# cannot be re-run elsewhere until it points at a local copy of the file.
dataset = import_tweets(r"G:\Coding Blocks\Documents\twitter.csv")

In [6]:
# Preview the first five rows to check the loaded columns.
dataset.head(n=5)

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [7]:
# Clean every tweet with preprocess_tweet, overwriting the "text" column.
# NOTE: the raw text is replaced, so re-running this cell feeds already-cleaned
# text through the cleaning step again.
dataset["text"]=dataset["text"].apply(preprocess_tweet)

In [8]:
# Pull the target column and the cleaned text out as NumPy arrays; these are
# what the vectorizer and the classifiers below consume.
labels = np.array(dataset["sentiment"])
data = np.array(dataset["text"])

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Build the TF-IDF vectorizer. stop_words="english" selects scikit-learn's
# built-in English stop-word list.
tfv= TfidfVectorizer(sublinear_tf=True, stop_words = "english") # TODO: supply a proper stop-word list for better performance
# Learn the vocabulary from every tweet in `data` and transform the same tweets
# into the feature matrix; `tfv` is reused later by findpolarity.
features=tfv.fit_transform(data)

In [10]:
from sklearn.naive_bayes import MultinomialNB
# First classifier: multinomial naive Bayes with its default settings.
model = MultinomialNB()

In [11]:
# Train on the full feature matrix; no train/test split is made before this fit.
model.fit(features,labels)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [12]:
def findpolarity(model):
    """Prompt for one tweet on stdin and print a message for its predicted polarity.

    Arguments:
        model: a fitted classifier exposing ``predict``; it receives the tweet
               after cleaning with ``preprocess_tweet`` and vectorising with
               the notebook-level ``tfv``.

    Prints the "sad tweet" message when the predicted label equals 0 and the
    "positive tweet" message otherwise. Returns nothing.
    """
    raw_text = input("Enter the Tweet\n")
    cleaned_text = preprocess_tweet(raw_text)
    # The vectorizer expects a collection of documents, so wrap the single tweet.
    tweet_vector = tfv.transform(np.array([cleaned_text]))
    predicted_label = model.predict(tweet_vector)[0]
    if predicted_label == 0:
        message = "Don't enter sad tweets ! Atleast not on Twitter !?"
    else:
        message = "Thats a positive tweet right there on twitter!"
    print(message)

In [13]:
# Predict on the same feature matrix the model was fitted on, so the score
# computed from these predictions is a training-set score.
y_p = model.predict(features)

In [14]:
# Number of predictions that match their label (still a count, not a ratio).
acc = np.sum(y_p == labels)

In [15]:
# Turn the count into a fraction of all samples.
# NOTE: `acc` is overwritten in place, so running this cell twice divides twice.
acc = acc/len(labels)

In [16]:
# Accuracy of the naive Bayes model on the data it was trained on.
print(acc)

0.79583125


In [18]:
# Interactive check with the naive Bayes model; waits for keyboard input.
findpolarity(model = model)

Enter the Tweet
i am really hopeless
Don't enter sad tweets ! Atleast not on Twitter !?


In [19]:
from sklearn.linear_model import LogisticRegression
# Second classifier for comparison: logistic regression, with C set explicitly
# to 1.0.
model2 = LogisticRegression(C=1.)

In [20]:
# Train on the same full feature matrix and labels used for the first model.
model2.fit(features,labels)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [21]:
# Predict on the same feature matrix the model was fitted on (training-set score).
y_p2 = model2.predict(features)

In [22]:
# Number of predictions that match their label (still a count, not a ratio).
acc2 = np.sum(y_p2 == labels)

In [23]:
# Turn the count into a fraction of all samples.
# NOTE: `acc2` is overwritten in place, so running this cell twice divides twice.
acc2 = acc2/len(labels)

In [24]:
# Accuracy of the logistic regression model on the data it was trained on.
print(acc2)

0.801530625


In [28]:
# Interactive check with the logistic regression model; waits for keyboard input.
findpolarity(model = model2)

Enter the Tweet
what is happening with the world!
Don't enter sad tweets ! Atleast not on Twitter !?
