In [1]:
import pandas as pd

## Text reading line by line

In [2]:
# Map each review source to the path of its labelled-sentences text file.
filepath_dict = dict(
    yelp='data/sentiment_analysis/yelp_labelled.txt',
    amazon='data/sentiment_analysis/amazon_cells_labelled.txt',
    imdb='data/sentiment_analysis/imdb_labelled.txt',
)

In [3]:
# Read every labelled-sentences file and stack them into one DataFrame,
# tagging each row with the source it came from.
df_list = []
for source, filepath in filepath_dict.items():
    # The files are read as tab-separated text without a header row, so the
    # column names are supplied here.
    # NOTE(review): read_csv's default quoting treats '"' as a quote
    # character; if raw sentences contain unbalanced quotes, several lines can
    # be merged into one row. Confirm the per-source row counts and consider
    # quoting=csv.QUOTE_NONE if they are lower than expected.
    df_source = pd.read_csv(filepath, names=['sentence', 'review'], sep='\t')
    df_source['source'] = source
    df_list.append(df_source)

# Each file is read with its own 0..n-1 index; ignore_index=True renumbers the
# combined rows so that index labels are unique across sources.
df = pd.concat(df_list, ignore_index=True)

In [4]:
# Preview the first rows of the combined frame (sentence, review label, source).
df.head()

Unnamed: 0,sentence,review,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp


In [5]:
# Inspect a single row by position (the fifth row) as a Series.
df.iloc[4]

sentence    The selection on the menu was great and so wer...
review                                                      1
source                                                   yelp
Name: 4, dtype: object

## Sklearn for reading sentences

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

In [7]:
# Two toy sentences used to illustrate how CountVectorizer builds a vocabulary.
sentences = [
    'John likes ice cream',
    'John hates chocolate.',
]

In [8]:
# Build the vocabulary from the toy sentences.
# lowercase=False keeps the original casing, so 'John' stays capitalised.
# min_df=1 keeps every word that occurs in at least one sentence. An integer 0
# selects the same words but lies outside the documented range for min_df
# (int >= 1 or float in [0, 1]) and is rejected by scikit-learn's parameter
# validation in recent releases.
vectorizer = CountVectorizer(min_df=1, lowercase=False)
vectorizer.fit(sentences)
vectorizer.vocabulary_

{'John': 0, 'likes': 5, 'ice': 4, 'cream': 2, 'hates': 3, 'chocolate': 1}

### Note: the vectorizer takes each distinct word in the sentences and stores it in a vocabulary dictionary that maps the word to a column index.

In [9]:
# Encode the toy sentences with the fitted vocabulary and show the result as a
# dense array: one row per sentence, one column per vocabulary word.
vectorizer.transform(sentences).toarray()

array([[1, 0, 1, 0, 1, 1],
       [1, 1, 0, 1, 0, 0]])

In [10]:
# Encode new sentences with the already-fitted vocabulary; words that were not
# seen during fit have no column and therefore do not appear in the output.
test = [
    'hello world John and no one',
    'cream with chocolate',
]
vectorizer.transform(test).toarray()

array([[1, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 0, 0]])

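### Note: `transform()` followed by `toarray()` returns an array with one row per sentence and one column per vocabulary word. Each entry is the number of times that word occurs in the sentence (here 1 or 0); words that are not in the fitted vocabulary are ignored.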

## Defining a Baseline Model

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Keep only the rows whose source is Yelp.
df_yelp = df.loc[df['source'].eq('yelp')]

In [13]:
# Pull the text column and its label column out of the Yelp subset as arrays.
sentences, review = (
    df_yelp[column].values for column in ('sentence', 'review')
)

In [14]:
# Hold out 25% of the Yelp sentences for testing; random_state is fixed so the
# same split is produced on every run.
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences,
    review,
    test_size=0.25,
    random_state=1000,
)

In [15]:
# Learn the vocabulary from the training sentences only, then encode both
# splits with that same vocabulary (fit() returns the fitted vectorizer).
vectorizer = CountVectorizer().fit(sentences_train)

X_train = vectorizer.transform(sentences_train)
X_test = vectorizer.transform(sentences_test)
X_train

<750x1714 sparse matrix of type '<class 'numpy.int64'>'
	with 7368 stored elements in Compressed Sparse Row format>

## Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
# Baseline model: logistic regression on the bag-of-words features, scored by
# accuracy on the held-out Yelp sentences (fit() returns the fitted estimator).
classifier = LogisticRegression().fit(X_train, y_train)
score = classifier.score(X_test, y_test)
print("Accuracy:", score)

Accuracy: 0.796


In [19]:
# Repeat the baseline (split -> bag-of-words -> logistic regression) for each
# source and print its test accuracy.
# Note: the loop rebinds sentences, vectorizer, classifier, X_train, X_test,
# y_train, y_test and score, so afterwards they hold the last source's values.
for source in df['source'].unique():
    df_source = df.loc[df['source'].eq(source)]
    sentences, y = (
        df_source[column].values for column in ('sentence', 'review')
    )

    sentences_train, sentences_test, y_train, y_test = train_test_split(
        sentences,
        y,
        test_size=0.25,
        random_state=1000,
    )

    # Vocabulary is learned from the training split only.
    vectorizer = CountVectorizer().fit(sentences_train)
    X_train = vectorizer.transform(sentences_train)
    X_test = vectorizer.transform(sentences_test)

    classifier = LogisticRegression().fit(X_train, y_train)
    score = classifier.score(X_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))

Accuracy for yelp data: 0.7960
Accuracy for amazon data: 0.7960
Accuracy for imdb data: 0.7487


### Note: `unique()` returns each distinct value of the column once, with duplicates removed.