In [3]:
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.model_selection import train_test_split  
from sklearn.linear_model import LogisticRegression 

In [7]:
filepath_dict = {'yelp':   'yelp_labelled.txt',
                 'amazon': 'amazon_cells_labelled.txt',
                 'imdb':   'imdb_labelled.txt'}

df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source  # Add another column filled with the source name
    df_list.append(df)

df = pd.concat(df_list)
print(df.iloc[0])
df.head()

sentence    Wow... Loved this place.
label                              1
source                          yelp
Name: 0, dtype: object


Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp


In [8]:
sen = ['John likes ice cream', 'John hates chocolate.']
vector = CountVectorizer()

In [9]:
vector.fit(sen)

CountVectorizer()

In [10]:
vector.vocabulary_

{'john': 4, 'likes': 5, 'ice': 3, 'cream': 1, 'hates': 2, 'chocolate': 0}

In [11]:
vector.transform(sen).toarray()

array([[0, 1, 0, 1, 1, 1],
       [1, 0, 1, 0, 1, 0]])

In [12]:
sen = df['sentence'].values
sen

array(['Wow... Loved this place.', 'Crust is not good.',
       'Not tasty and the texture was just nasty.', ...,
       'In a word, it is embarrassing.  ', 'Exceptionally bad!  ',
       "All in all its an insult to one's intelligence and a huge waste of money.  "],
      dtype=object)

In [13]:
label = df['label'].values
label

array([1, 0, 0, ..., 0, 0, 0])

In [15]:
x_train , x_test , y_train , y_test = train_test_split(sen,label, test_size=0.3)
x_train


array(['The staff is great, the food is delish, and they have an incredible beer selection.',
       'It will drive you barking mad!  ',
       'THERE IS NO PLOT OR STORYLINE!!  ', ...,
       'Reversible plug works great.',
       'One of the most disappointing aspects is the lack of notable gore.  ',
       'Disapointing Results.'], dtype=object)

In [16]:
vector.fit(x_train)
vector.vocabulary_

{'the': 3650,
 'staff': 3436,
 'is': 1963,
 'great': 1630,
 'food': 1472,
 'delish': 962,
 'and': 166,
 'they': 3663,
 'have': 1708,
 'an': 164,
 'incredible': 1889,
 'beer': 332,
 'selection': 3204,
 'it': 1967,
 'will': 4072,
 'drive': 1121,
 'you': 4143,
 'barking': 300,
 'mad': 2217,
 'there': 3661,
 'no': 2462,
 'plot': 2744,
 'or': 2551,
 'storyline': 3485,
 'only': 2537,
 'reason': 2943,
 'to': 3711,
 'eat': 1169,
 'here': 1747,
 'would': 4121,
 'be': 317,
 'fill': 1405,
 'up': 3881,
 'before': 334,
 'night': 2456,
 'of': 2511,
 'binge': 371,
 'drinking': 1118,
 'just': 2018,
 'get': 1575,
 'some': 3369,
 'carbs': 541,
 'in': 1876,
 'your': 4145,
 'stomach': 3477,
 'we': 4022,
 'recently': 2951,
 'witnessed': 4092,
 'her': 1746,
 'poor': 2762,
 'quality': 2890,
 'management': 2240,
 'towards': 3746,
 'other': 2563,
 'guests': 1657,
 'as': 222,
 'well': 4038,
 'gets': 1576,
 'job': 1994,
 'done': 1083,
 'so': 3357,
 'bad': 281,
 'actually': 91,
 'worth': 4118,
 'seeing': 3199,
 '

In [17]:
X_train = vector.transform(x_train)
X_train
X_test = vector.transform(x_test)
X_test

<825x4157 sparse matrix of type '<class 'numpy.int64'>'
	with 8110 stored elements in Compressed Sparse Row format>

In [18]:
model = LogisticRegression()
model.fit(X_train,y_train)
model.score(X_test, y_test)

0.8193939393939393

In [19]:
predicted = vector.transform(x_test)
predicted.toarray
model.predict(predicted)

array([0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1,
       1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1,
       0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0,

In [None]:
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    sentences = df_source['sentence'].values
    y = df_source['label'].values

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences, y, test_size=0.25, random_state=1000)

    vectorizer = CountVectorizer()
    vectorizer.fit(sentences_train)
    X_train = vectorizer.transform(sentences_train)
    X_test  = vectorizer.transform(sentences_test)

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))