In [92]:
import pandas as pd
import numpy as np
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

def setup():
    yelp_df = pd.read_csv(
        "sentiment_labelled_sentences/yelp_labelled.txt", 
        "\t", 
        encoding='utf-8',
        header = None, 
        names = ["review", "score"]
    )
    amazon_df = pd.read_csv(
        "sentiment_labelled_sentences/amazon_cells_labelled.txt", 
        "\t", 
        header = None, 
        names = ["review", "score"]
    )
    imdb_df = pd.read_csv(
        "sentiment_labelled_sentences/imdb_labelled.txt", 
        "\t", 
        header = None,
        names = ["review", "score"]
    )
    
    yelp_df = yelp_df.dropna()
    amazon_df = amazon_df.dropna()
    imdb_df = imdb_df.dropna()

    yelp_df['src'] = pd.Series("yelp", index = yelp_df.index)
    amazon_df['src'] = pd.Series("amazon", index = amazon_df.index)
    imdb_df['src'] = pd.Series("imdb", index = imdb_df.index)
    
    data_df = pd.concat([yelp_df, amazon_df, imdb_df])
    
    return data_df
    
setup()


Unnamed: 0,review,score,src
0,Wow... Loved this place.,1,yelp
3,Crust is not good.,0,yelp
4,Not tasty and the texture was just nasty.,0,yelp
10,Stopped by during the late May bank holiday of...,1,yelp
11,The selection on the menu was great and so wer...,1,yelp
12,Now I am getting angry and I want my damn pho.,0,yelp
13,Honeslty it didn't taste THAT fresh.),0,yelp
16,The potatoes were like rubber and you could te...,0,yelp
18,The fries were great too.,1,yelp
19,A great touch.,1,yelp


In [3]:
def prt_a():
    data_df = setup()
    score = data_df["score"].value_counts()

    print("ratio of 0 to 1: {0}".format(score[0]/float(score[1])) )

prt_a()

ratio of 0 to 1: 0.982683982684


In [103]:
def clean_data(data_df):
    #Lower casing
    data_df["review"] = data_df["review"].str.lower()
    
    #Stripping Punctuation     
    data_df["review"] = data_df["review"].str.replace('[^\w\s]','')
    
    # Lemmatizing using NLTK stemmer
    stemmer = PorterStemmer()     
    new_arr=[]
    for review in data_df["review"]:
        new_sent=[]
        for word in review.split():
            try:
                if word not in stopwords.words("english"):
                    new_sent.append(stemmer.stem(word))
            except:
                pass
        new_arr.append(" ".join(new_sent))
    data_df["creview"]=np.array(new_arr)

def prt_b():
    data_df = setup()
    clean_data(data_df)
    print(data_df)

prt_b()

                                                review  score   src  \
0                                 wow loved this place      1  yelp   
3                                    crust is not good      0  yelp   
4             not tasty and the texture was just nasty      0  yelp   
10   stopped by during the late may bank holiday of...      1  yelp   
11   the selection on the menu was great and so wer...      1  yelp   
12       now i am getting angry and i want my damn pho      0  yelp   
13                  honeslty it didnt taste that fresh      0  yelp   
16   the potatoes were like rubber and you could te...      0  yelp   
18                            the fries were great too      1  yelp   
19                                       a great touch      1  yelp   
23                             service was very prompt      1  yelp   
24                                   would not go back      0  yelp   
26   the cashier had no care what so ever on what i...      0  yelp   
28   i

In [117]:
def test_train_split(data_df):
    amazon=data_df[data_df["src"]=="amazon"]
    yelp=data_df[data_df["src"]=="yelp"]
    imdb=data_df[data_df["src"]=="imdb"]
    
    amazon_neg=amazon[amazon["score"]==0]
    amazon_pos=amazon[amazon["score"]==1]
    imdb_neg=imdb[imdb["score"]==0]
    imdb_pos=imdb[imdb["score"]==1]
    yelp_neg=yelp[yelp["score"]==0]
    yelp_pos=yelp[yelp["score"]==1]
    
    training=pd.concat([
        amazon_neg.iloc[0:399],
        amazon_pos.iloc[0:399],
        yelp_pos.iloc[0:399],
        yelp_neg.iloc[0:399],
        imdb_pos.iloc[0:308],
        imdb_neg.iloc[0:288]
    ])
    testing=pd.concat([
        amazon_neg.iloc[400:499],
        amazon_pos.iloc[400:499],
        yelp_pos.iloc[400:499],
        yelp_neg.iloc[400:499],
        imdb_pos.iloc[309:386], #386
        imdb_neg.iloc[289:362]    
    ])
    
    return(training,testing)

def part_c():
    data_df = setup()
    clean_data(data_df)
    training,testing=test_train_split(data_df)
    print(training)
    
part_c()

                                                review  score     src  \
1    so there is no way for me to plug it in here i...      0  amazon   
10   tied to charger for conversations lasting more...      0  amazon   
13   i have to jiggle the plug to get it to line up...      0  amazon   
15   if you have several dozen or several hundred c...      0  amazon   
18                   needless to say i wasted my money      0  amazon   
21                      what a waste of money and time      0  amazon   
29   if the two were seperated by a mere 5 ft i sta...      0  amazon   
33   the design is very odd as the ear clip is not ...      0  amazon   
36                  i advise everyone do not be fooled      0  amazon   
41   it clicks into place in a way that makes you w...      0  amazon   
42   i went on motorolas website and followed all d...      0  amazon   
51             the commercials are the most misleading      0  amazon   
56   i bought it for my mother and she had a proble

In [125]:
DATA_DF = setup()
clean_data(DATA_DF)
TRAINING,TESTING = test_train_split(DATA_DF)

In [194]:
def bag_of_words(training):
    dict_s = pd.Series()
    for review in training["creview"]:
        for word in review.split():
            if word not in dict_s:
                dict_s[word] = 1
#             else:
#                 dict_s[word] += 1
    return dict_s

def counter(data_df, bag):
    for review in data_df["creview"]:
        for word in review.split():
            if word in bag:
                bag[word] += 1
    return bag.sort_values()

def featurenator(review, count_bag):
    vector = np.zeros(len(count_bag))
    for word in review.split():
        try:
            ind = count_bag.keys().get_loc(word)
            vector[ind] +=1
        except:
            print("Error in looking up: "+word)
    return vector

def part_d():
    data_df = DATA_DF
    training = TRAINING
    testing = TESTING
    bag = bag_of_words(training)
    count_bag = counter(data_df, bag)
    fvector1 = featurenator(data_df["creview"].iloc[0], count_bag)
    fvector2 = featurenator(data_df["creview"].iloc[20], count_bag)
    print(fvector1, fvector2)

part_d()

BAG = counter(DATA_DF, bag_of_words(TRAINING))

(array([ 0.,  0.,  0., ...,  0.,  0.,  0.]), array([ 0.,  0.,  0., ...,  0.,  0.,  0.]))


In [247]:
from numpy import linalg as la

def standardize(feature):
    mean = np.mean(feature)
    var = np.var(feature)
    for i in (feature - mean)/var :
        if (i-1.00081855389) > 0.000001:
            print(i)
    
def el2(feature):
    normed=feature / la.norm(feature)
    print(la.norm(feature))
    for i in normed:
        if i > 0.000001:
            print i

def make_features(data_df,bag):
    features=[]
    for review in data_df["creview"].values:
        features.append(featurenator(review,bag))
    return features
    
def part_e():
    data_df = DATA_DF
    bag = BAG
    training=TRAINING
#     feature = featurenator(data_df["creview"].iloc[0], bag)
# #     standardize(feature)
#     el2(feature)
    make_features(training,bag)
#     print(u"cart" in bag)
part_e()  

In [211]:
FEATURES = make_features(TRAINING,BAG)

In [267]:
from numpy import linalg as la
def random_initial(feature_vectors):
    data_min=np.min(feature_vectors)
    data_max=np.max(feature_vectors)
    feature_len=len(feature_vectors[0])
    random_vec = np.random.uniform(data_min,data_max,feature_len)
    return random_vec
    
def setup_initials(k,feature_vectors):
    centroids=[]
    for i in range(k):
        centroids.append(random_initial(feature_vectors))
    return centroids

def k_means(k,feature_vectors):
    return k_mean_help(k,feature_vectors,setup_initials(k,feature_vectors),{"CHEAT":"CHEAT"})
    
def k_mean_help(k,feature_vectors,centroids,old_rev_dic):
    label_dic={}
    rev_dic={}
    for i in range(len(feature_vectors)):
        assign_closest(i,feature_vectors[i],centroids,label_dic,rev_dic)
    
    print("centroids: "+str(centroids))
    
    if rev_dic==old_rev_dic:
        return (label_dic,rev_dic)
    
    new_centroids=recalc_centroids(feature_vectors,rev_dic)
    print("new_centroids: "+str(new_centroids))
    return k_mean_help(k,feature_vectors,new_centroids,rev_dic)
#     print(label_dic)
#     print(rev_dic)

def assign_closest(vector_alias,feature_vector,centroids,label_dic,rev_dic):
    dists=[]
    for centroid in centroids:
        dists.append(la.norm(feature_vector-centroid))
    
    #Vector -> Which centroid its assigned to
    which_centroid=np.argmin(dists)
    label_dic[vector_alias]=which_centroid
    
    #Which centroid -> list of vectors
    if which_centroid in rev_dic:
        rev_dic[which_centroid].append(vector_alias)
    else:
        rev_dic[which_centroid]=[vector_alias]
        
def reconst_features(feat_indexes,feature_vectors):
    features=[]
    for el in feat_indexes:
        features.append(feature_vectors[el])
    return np.array(features)
        
        
def recalc_centroids(feature_vectors,rev_dic):
    new_centroids=[]
    print("keys: "+str(rev_dic.keys()))
    for key in rev_dic.keys():
        group_features=reconst_features(rev_dic[key],feature_vectors)
        new_centroids.append(np.mean(group_features, axis=0))
    return new_centroids
        
def part_f():
    feature_vectors=FEATURES
#     print(np.max(feature_vectors), np.min(feature_vectors))
#     k_means(2,feature_vectors)
    fake_features=np.array([[1,1],[1,3],[3,5],[4,5],[5,4]])
    fake_features2=np.array([[1,1],[1,5],[3,4],[3,7],[1,9],[4,6]])#,[8,3],[8,5],[9,3],[9,5]])
    fake_centroids=np.array([[1,2],[5,5]])
    
    print(k_means(2,fake_features2))
    
#     k_mean_help(2,fake_features,fake_centroids)

part_f()

centroids: [array([ 6.97040242,  8.80704546]), array([ 5.22509012,  6.28911576])]
keys: [1]
new_centroids: [array([ 2.16666667,  5.33333333])]
centroids: [array([ 2.16666667,  5.33333333])]
keys: [0]
new_centroids: [array([ 2.16666667,  5.33333333])]
centroids: [array([ 2.16666667,  5.33333333])]
({0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0}, {0: [0, 1, 2, 3, 4, 5]})
