In [1]:
import numpy as np
import pandas as pd
from nltk.corpus import twitter_samples
import nltk
all_positive_tweets = twitter_samples.strings('positive_tweets.json')
all_negative_tweets = twitter_samples.strings('negative_tweets.json')
train_pos=all_positive_tweets[:4000]
train_neg=all_negative_tweets[:4000]
test_pos=all_positive_tweets[4000:]
test_neg=all_negative_tweets[4000:]

In [2]:
train_x=train_pos+train_neg
test_x=test_pos+test_neg
train_y = np.append(np.ones((len(train_pos), 1)), np.zeros((len(train_neg), 1)), axis=0)
test_y=np.append(np.ones((len(test_pos),1)),np.zeros((len(test_neg),1)),axis=0)

In [3]:
import re 
import string 
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer

In [103]:
def clean_tweets(tweets):
    ct=[]
    for t in tweets:
        t=re.sub(r'@\w*','',t)
        t=re.sub(r'#','',t)
        t=re.sub(r'http\S+','',t)
        t=re.sub(r'[$&\n]\w*','',t)
        t=re.sub(r'[.*]','',t)
#         t=re.sub(r'u','',t)
#         t=re.sub(r'ur','',t)
#         t=re.sub(r'[`^()-]','',t)
        ct.append(t)
    return ct
        

In [104]:
def tok_stem_tweets(clean_tweets):
    st=[]
    tokenizer=TweetTokenizer(preserve_case=False,strip_handles=True,reduce_len=True)
    stemmer=PorterStemmer()
    sw=stopwords.words('english')
    for t in clean_tweets:
        tokens=tokenizer.tokenize(t)
        stem_tweet=[]
        for word in tokens:
            if(word not in sw and word not in string.punctuation):
                stem_word=stemmer.stem(word)
                stem_tweet.append(stem_word)
        st.append(stem_tweet)
    return st
            

In [105]:
def gen_freq_table(stem_tweets,target):
    freq={}
    target=np.squeeze(target).tolist()
    for y,tweet in zip(target,stem_tweets):
        for word in tweet:
            pair=(word,y)
            if pair in freq:
                freq[pair]+=1
            else:
                freq[pair]=1
    return freq
    

In [106]:
def gen_features(stem_tweets,freq):
    feature_matrix=[]
    for t in stem_tweets:
        t_features=[1]
        pos=0
        neg=0
        for word in t:
            try:
                pos=pos+freq[(word,1)]
            except:
                None
            try:
                neg=neg+freq[(word,0)]
            except:
                None
        t_features.append(pos)
        t_features.append(neg)
        feature_matrix.append(t_features)
    return feature_matrix
                
            

In [107]:
cleanx1=clean_tweets(train_x)

In [108]:
stemx1=tok_stem_tweets(cleanx1)

In [109]:
freq=gen_freq_table(stemx1,train_y)
freq

{('followfriday', 1.0): 23,
 ('top', 1.0): 29,
 ('engag', 1.0): 7,
 ('member', 1.0): 14,
 ('commun', 1.0): 27,
 ('week', 1.0): 72,
 (':)', 1.0): 2960,
 ('hey', 1.0): 58,
 ('jame', 1.0): 7,
 ('odd', 1.0): 2,
 (':/', 1.0): 5,
 ('pleas', 1.0): 80,
 ('call', 1.0): 27,
 ('contact', 1.0): 4,
 ('centr', 1.0): 1,
 ('02392441234', 1.0): 1,
 ('abl', 1.0): 6,
 ('assist', 1.0): 1,
 ('mani', 1.0): 28,
 ('thank', 1.0): 510,
 ('listen', 1.0): 15,
 ('last', 1.0): 39,
 ('night', 1.0): 55,
 ('bleed', 1.0): 2,
 ('amaz', 1.0): 41,
 ('track', 1.0): 5,
 ('scotland', 1.0): 2,
 ('congrat', 1.0): 15,
 ('yeaaah', 1.0): 1,
 ('yipppi', 1.0): 1,
 ('accnt', 1.0): 2,
 ('verifi', 1.0): 2,
 ('rqst', 1.0): 1,
 ('succeed', 1.0): 1,
 ('got', 1.0): 56,
 ('blue', 1.0): 8,
 ('tick', 1.0): 1,
 ('mark', 1.0): 1,
 ('fb', 1.0): 4,
 ('profil', 1.0): 2,
 ('15', 1.0): 4,
 ('day', 1.0): 185,
 ('one', 1.0): 91,
 ('irresist', 1.0): 2,
 ('like', 1.0): 186,
 ('keep', 1.0): 54,
 ('love', 1.0): 291,
 ('custom', 1.0): 3,
 ('wait', 1.0): 5

In [110]:
x1=gen_features(stemx1,freq)

In [111]:
x1

[[1, 3132, 61],
 [1, 3690, 406],
 [1, 3119, 116],
 [1, 2975, 4],
 [1, 3229, 226],
 [1, 3053, 120],
 [1, 3992, 535],
 [1, 3277, 278],
 [1, 655, 188],
 [1, 199, 65],
 [1, 3127, 55],
 [1, 3254, 130],
 [1, 4065, 738],
 [1, 3192, 205],
 [1, 3412, 262],
 [1, 987, 239],
 [1, 2969, 10],
 [1, 1192, 248],
 [1, 3116, 30],
 [1, 3641, 536],
 [1, 3606, 344],
 [1, 3074, 113],
 [1, 3117, 122],
 [1, 3622, 447],
 [1, 3003, 57],
 [1, 1062, 93],
 [1, 3342, 406],
 [1, 2988, 26],
 [1, 3214, 129],
 [1, 569, 29],
 [1, 3001, 124],
 [1, 3028, 87],
 [1, 2997, 25],
 [1, 4297, 525],
 [1, 639, 80],
 [1, 4065, 738],
 [1, 3136, 79],
 [1, 661, 126],
 [1, 817, 170],
 [1, 1015, 561],
 [1, 851, 177],
 [1, 791, 169],
 [1, 3706, 119],
 [1, 3496, 529],
 [1, 6462, 338],
 [1, 1108, 200],
 [1, 3521, 544],
 [1, 2995, 42],
 [1, 3141, 78],
 [1, 895, 287],
 [1, 3241, 239],
 [1, 3132, 61],
 [1, 3206, 288],
 [1, 750, 170],
 [1, 3098, 77],
 [1, 2978, 10],
 [1, 3410, 285],
 [1, 3307, 198],
 [1, 6425, 490],
 [1, 524, 0],
 [1, 2978, 12]

In [112]:
cleanx2=clean_tweets(test_x)

In [113]:
stemx2=tok_stem_tweets(cleanx2)
stemx2

[['bro',
  'u',
  'wan',
  'cut',
  'hair',
  'anot',
  'ur',
  'hair',
  'long',
  'liao',
  'bo',
  'sinc',
  'ord',
  'liao',
  'take',
  'easi',
  'lor',
  'treat',
  'save',
  'leav',
  'longer',
  ':)',
  'lol',
  'sibei',
  'xialan'],
 ['back', 'thnx', 'god', "i'm", 'happi', ':)'],
 ['thought',
  'ear',
  'malfunct',
  'thank',
  'good',
  'clear',
  'one',
  'apolog',
  ':-)'],
 ['stuck', 'centr', 'right', 'clown', 'right', 'joker', 'left', ':)'],
 ['happi', 'friday', ':-)'],
 ['follow', ':)', 'x'],
 ['teenchoic',
  'choiceinternationalartist',
  'superjunior',
  'fight',
  'oppa',
  ':D'],
 ['birthday',
  'today',
  'birthday',
  'wish',
  'hope',
  "there'",
  'good',
  'news',
  'ben',
  'soon',
  ':-)'],
 ['good',
  'morn',
  ':-)',
  'friday',
  '\U000fec00',
  'plan',
  'day',
  'current',
  'play',
  'shop'],
 ['happi', 'friday', ':)'],
 ['3', 'good', 'nigth', ':)', 'estoy', 'escuchando', 'enemi', 'god'],
 ['actual', 'bye', 'bye', 'inde', 'go', 'take', 'drama', 'elsewher

In [114]:
x2=gen_features(stemx2,freq)
x2

[[1, 3369, 401],
 [1, 3411, 391],
 [1, 1376, 327],
 [1, 3052, 114],
 [1, 802, 27],
 [1, 3393, 283],
 [1, 551, 19],
 [1, 1136, 425],
 [1, 1156, 294],
 [1, 3210, 29],
 [1, 3308, 146],
 [1, 3140, 252],
 [1, 3108, 36],
 [1, 3049, 104],
 [1, 2966, 4],
 [1, 3470, 95],
 [1, 3163, 216],
 [1, 528, 3],
 [1, 567, 14],
 [1, 802, 27],
 [1, 3507, 115],
 [1, 2960, 6],
 [1, 2964, 3],
 [1, 738, 52],
 [1, 3133, 121],
 [1, 2964, 8],
 [1, 2963, 4],
 [1, 773, 78],
 [1, 1354, 563],
 [1, 3187, 71],
 [1, 3303, 64],
 [1, 580, 84],
 [1, 3155, 170],
 [1, 3314, 444],
 [1, 3105, 117],
 [1, 549, 44],
 [1, 3109, 58],
 [1, 117, 16],
 [1, 3410, 440],
 [1, 576, 13],
 [1, 2962, 3],
 [1, 1275, 423],
 [1, 3201, 155],
 [1, 628, 22],
 [1, 550, 36],
 [1, 4172, 471],
 [1, 3245, 171],
 [1, 3908, 278],
 [1, 539, 29],
 [1, 3148, 118],
 [1, 3031, 53],
 [1, 3074, 160],
 [1, 3210, 29],
 [1, 3176, 149],
 [1, 3071, 189],
 [1, 155, 28],
 [1, 3344, 306],
 [1, 3428, 455],
 [1, 3348, 243],
 [1, 3512, 99],
 [1, 613, 49],
 [1, 3697, 201],


In [115]:
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression()

In [116]:
clf.fit(x1,train_y)

  y = column_or_1d(y, warn=True)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [117]:
clf.score(x1,train_y)

0.994375

In [118]:
clf.score(x2,test_y)

0.995