In [8]:
import numpy as np
import pandas as pd
import re

In [9]:
def import_tweets(filename, header = None):
    """Read a tweet CSV file and reduce it to sentiment labels plus tweet text.

    Parameters
    ----------
    filename : path or file-like accepted by ``pd.read_csv``
        UTF-8 encoded CSV whose six columns are, in order:
        sentiment, id, date, flag, user, text.
    header : int or None, default None
        Passed straight to ``pd.read_csv``; ``None`` means the file has no
        header row.

    Returns
    -------
    pandas.DataFrame
        Frame with only the ``sentiment`` and ``text`` columns, where every
        sentiment value of 4 has been replaced by 1.
    """
    raw_frame = pd.read_csv(filename, encoding='utf-8', header=header)
    raw_frame.columns = ['sentiment', 'id', 'date', 'flag', 'user', 'text']

    # flag, id, user and date are not required for the analysis
    unused_columns = ['flag', 'id', 'user', 'date']
    trimmed = raw_frame.drop(columns=unused_columns)

    # in the dataset positive = 4 and negative = 0, so map positive to 1
    trimmed['sentiment'] = trimmed['sentiment'].replace(4, 1)
    return trimmed

In [10]:
def preprocess_tweet(tweet):
    """Normalize a single tweet string for vectorization.

    Steps, in order:
      1. lower-case the text,
      2. replace every URL (``www.…`` or ``http(s)://…``) with ``URL``,
      3. replace every ``@username`` mention with ``AT_USER``,
      4. collapse any run of whitespace into a single space,
      5. strip the leading ``#`` from hashtags, keeping the tag word.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The normalized tweet.
    """
    # str.lower() returns a new string (strings are immutable), so the result
    # must be assigned back; lower-casing first also lets the case-sensitive
    # URL pattern below match inputs such as "HTTP://..." or "WWW....".
    tweet = tweet.lower()
    # convert all urls to the string "URL"
    tweet = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', tweet)
    # convert all @username to "AT_USER"
    tweet = re.sub(r'@[^\s]+', 'AT_USER', tweet)
    # collapse multiple white spaces into a single white space
    tweet = re.sub(r'[\s]+', ' ', tweet)
    # drop the '#' of hashtags but keep the tag text itself
    tweet = re.sub(r'#([^\s]+)', r'\1', tweet)
    return tweet

In [13]:
def feature_extraction(data):
    """Turn a collection of text documents into a TF-IDF feature matrix.

    A fresh ``TfidfVectorizer`` (sublinear term-frequency scaling, English
    stop words removed) is fitted on ``data`` and immediately used to
    transform it.

    Parameters
    ----------
    data : iterable of str
        The documents to vectorize.

    Returns
    -------
    The matrix produced by ``TfidfVectorizer.fit_transform(data)``. The fitted
    vectorizer itself is not returned.
    """
    from sklearn.feature_extraction.text import TfidfVectorizer

    vectorizer = TfidfVectorizer(stop_words="english", sublinear_tf=True)
    return vectorizer.fit_transform(data)

In [26]:
def train_classifier(feature_train, target_train,feature_test, target_test):
    """Fit a logistic-regression model and print its ROC AUC on both splits.

    The model is trained on ``feature_train`` / ``target_train``. For the
    training split and then the test split, the second column of
    ``predict_proba`` is scored against the true targets with
    ``roc_auc_score`` and the result is printed.

    Parameters
    ----------
    feature_train, feature_test
        Feature matrices for the training and test splits.
    target_train, target_test
        Target labels matching the respective feature matrices.

    Returns
    -------
    None
        Results are only printed; the fitted model is not returned.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.metrics import roc_auc_score

    classifier = LogisticRegression(C=1.)
    classifier.fit(feature_train, target_train)

    # Evaluate on the training data first, then on the held-out test data.
    evaluation_sets = (
        ("auc on train data:", feature_train, target_train),
        ("auc on test data:", feature_test, target_test),
    )
    for caption, matrix, targets in evaluation_sets:
        # column 1 of predict_proba is used as the "positive" probability
        positive_scores = classifier.predict_proba(matrix)[:, 1]
        print(caption, roc_auc_score(targets, positive_scores))

In [27]:
# Load the tweets, clean every tweet's text, and vectorize the cleaned text.
tweet_dataset = import_tweets("traintwitter.csv")
# apply the preprocess function to all the tweets in the dataset
tweet_dataset['text'] = tweet_dataset['text'].apply(preprocess_tweet)
# NumPy arrays of the cleaned tweet text and of the sentiment labels
data = np.array(tweet_dataset.text)
label = np.array(tweet_dataset.sentiment)
# NOTE(review): feature_extraction fits the TF-IDF vectorizer on the full
# dataset here, before the train/test split performed in a later cell, so
# statistics from the test rows influence the training features. Consider
# fitting on the training split only — confirm whether this is intended.
features = feature_extraction(data)


In [28]:
from sklearn.model_selection import train_test_split
# splitting the dataset into training and testing sets:
# 30% of the rows are held out for testing (test_size=0.3) and
# random_state=0 fixes the shuffle so the split is reproducible
feature_train, feature_test, target_train, target_test = train_test_split(features, label, test_size=0.3, random_state=0)

In [29]:
# fit the classifier on the training split and print the AUC for both splits
train_classifier(feature_train, target_train,feature_test, target_test)




auc on train data: 0.8855855352726824
auc on test data: 0.8592795832598463
