# Machine Learning

In [5]:
import pandas as pd


In [9]:
filepath_dict = {'yelp':   'yelp_labelled.txt',
                 'amazon': 'amazon_labelled.txt',
                 'imdb':   'imdb_labelled.txt'}
filepath_dict.items()

dict_items([('yelp', 'yelp_labelled.txt'), ('amazon', 'amazon_labelled.txt'), ('imdb', 'imdb_labelled.txt')])

In [10]:
df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source 
    df_list.append(df)
df = pd.concat(df_list)
df

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp
...,...,...,...
743,I just got bored watching Jessice Lange take h...,0,imdb
744,"Unfortunately, any virtue in this film's produ...",0,imdb
745,"In a word, it is embarrassing.",0,imdb
746,Exceptionally bad!,0,imdb


In [49]:
from sklearn.model_selection import train_test_split

In [13]:
df_yelp = df[df['source'] == 'yelp']
X = df_yelp['sentence'].values
Y = df_yelp['label'].values
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=100)

In [14]:
x_train

array(['We will not be coming back.',
       'We waited for forty five minutes in vain.',
       "I could barely stomach the meal, but didn't complain because it was a business lunch.",
       "I'd rather eat airline food, seriously.",
       "Needless to say, I won't be going back anytime soon.",
       'For that price I can think of a few place I would have much rather gone.',
       'To my disbelief, each dish qualified as the worst version of these foods I have ever tasted.',
       "I promise they won't disappoint.",
       'The decor is nice, and the piano music soundtrack is pleasant.',
       'Will go back next trip out.',
       'I found this place by accident and I could not be happier.',
       'The best place to go for a tasty bowl of Pho!',
       'Left very frustrated.', "You can't beat that.",
       'The portion was huge!',
       'We were sat right on time and our server from the get go was FANTASTIC!',
       'I love this place.',
       'I seriously cannot believe th

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(x_train)

CountVectorizer()

In [17]:
transformed_x_train = vectorizer.transform(x_train)

In [18]:
transformed_x_test = vectorizer.transform(x_test)

In [19]:
transformed_x_train

<750x1724 sparse matrix of type '<class 'numpy.int64'>'
	with 7422 stored elements in Compressed Sparse Row format>

## Train/classify the data using Logistic Regression


In [20]:
from sklearn.linear_model import LogisticRegression

In [22]:
# creating model and call it classifier
classifier = LogisticRegression()

In [23]:
classifier.fit(transformed_x_train, y_train)

LogisticRegression()

In [26]:
score = classifier.score(transformed_x_test, y_test)
score

0.808

## amazon Dataset

In [37]:
df_amazon = df[df['source'] == 'amazon']
X = df_amazon['sentence'].values
Y = df_amazon['label'].values
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=100)

In [38]:
x_train

array(['They work about 2 weeks then break.',
       'I had absolutely no problem with this headset linking to my 8530 Blackberry Curve!',
       'Motorola finally got the voice quality of a bluetooth headset right.',
       'This battery is an excellent bargain!',
       'much better than the hard plastic cases.', 'Good case!.',
       'Not good enough for the price.',
       'A must study for anyone interested in the "worst sins" of industrial design.',
       'i would advise to not purchase this item it never worked very well.',
       'The camera, although rated at an impressive 1.3 megapixels, renders images that fall well below expectations of such a relatively high resolution.',
       'It is unusable in a moving car at freeway speed.',
       'Light weight, I hardly notice it is there.',
       'I especially love the long battery life.',
       "Unreliable - I'm giving up.", 'I love this thing!',
       'Cheap but hey it works.. Was pleasantly suprised given the low cost of thi

## Shape the data in an acceptable form by Regression Model

In [39]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(x_train)

CountVectorizer()

In [41]:
transformed_x_train = vectorizer.transform(x_train)

In [42]:
transformed_x_test = vectorizer.transform(x_test)

In [43]:
transformed_x_train

<750x1551 sparse matrix of type '<class 'numpy.int64'>'
	with 6870 stored elements in Compressed Sparse Row format>

## Train/classify the data using Logistic Regression

In [46]:
classifier.fit(transformed_x_train, y_train)

LogisticRegression()

In [47]:
score = classifier.score(transformed_x_test, y_test)
score

0.82

## imdb Dataset

In [48]:
df_list = []
for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source 
    df_list.append(df)
df = pd.concat(df_list)
df

Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp
...,...,...,...
743,I just got bored watching Jessice Lange take h...,0,imdb
744,"Unfortunately, any virtue in this film's produ...",0,imdb
745,"In a word, it is embarrassing.",0,imdb
746,Exceptionally bad!,0,imdb


In [50]:
df_imdb = df[df['source'] == 'imdb']
X = df_imdb['sentence'].values
Y = df_imdb['label'].values
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=100)

In [51]:
x_train

array(['The results, well, are a shame.  ',
       'Not much dialogue, not much music, the whole film was shot as elaborately and aesthetically like a sculpture.  ',
       'She carries the movie well.  ',
       'Now we were chosen to be tortured with this disgusting piece of blatant American propaganda.  ',
       'THERE IS NO PLOT OR STORYLINE!!  ',
       'There is, however, some pretty good acting (at least, for this type of film).  ',
       "The movie is not completely perfect but 'Titta Di Girolamo' will stay with you for a long time after the vision of the movie.  ",
       'It just blew.  ', 'The movie seemed a little slow at first.  ',
       "It's a feel-good film and that's how I felt when I came out of the cinema!  ",
       "This is a masterful piece of film-making, with many themes simmering and occasionally boiling over in this warts and all study of the poet's bohemian, self-indulgent wartime years that span the aerial bombardments of London and the outward tranquilli

## Shape the data in an acceptable shape by Regression Model

In [52]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
vectorizer.fit(x_train)

CountVectorizer()

In [53]:
transformed_x_train = vectorizer.transform(x_train)

In [54]:
transformed_x_test = vectorizer.transform(x_test)

In [55]:
transformed_x_train

<561x2517 sparse matrix of type '<class 'numpy.int64'>'
	with 8495 stored elements in Compressed Sparse Row format>

## Train/classify the data using Logistic Regression

In [56]:
classifier.fit(transformed_x_train, y_train)

LogisticRegression()

In [57]:
score = classifier.score(transformed_x_test, y_test)
score

0.7379679144385026