In [20]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression


In [21]:
filepath_dict = {'yelp': 'data/yelp_labelled.txt',
                'amazon': 'data/amazon_cells_labelled.txt',
                'imdb': 'data/imdb_labelled.txt'}

In [22]:
df_list = []

for source, filepath in filepath_dict.items():
    df = pd.read_csv(filepath, names=['sentence', 'label'], sep='\t')
    df['source'] = source
    df_list.append(df)

In [23]:
df = pd.concat(df_list)

In [24]:
df_yelp = df[ df['source']=='yelp']
df_yelp.head(5)


Unnamed: 0,sentence,label,source
0,Wow... Loved this place.,1,yelp
1,Crust is not good.,0,yelp
2,Not tasty and the texture was just nasty.,0,yelp
3,Stopped by during the late May bank holiday of...,1,yelp
4,The selection on the menu was great and so wer...,1,yelp


In [25]:
for source in df['source'].unique():
    df_source = df[df['source'] == source]
    X = df_source['sentence'].values
    Y = df_source['label'].values
    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=100)
    vectorizer = CountVectorizer()
    vectorizer.fit(x_train)
    transformed_x_train = vectorizer.transform(x_train)
    transformed_x_test = vectorizer.transform(x_test)
    classifier = LogisticRegression()
    classifier.fit(transformed_x_train, y_train)
    score = classifier.score(transformed_x_test, y_test)
    print('Accuracy for {} data: {:.4f}'.format(source, score))
    

Accuracy for yelp data: 0.8080
Accuracy for amazon data: 0.8200
Accuracy for imdb data: 0.7380


# conclusions

A baseline is a method that uses heuristics, simple summary statistics that provides reasonable results on a task and does not require much expertise and time to build. Common baseline models include linear regression when predicting continuous values, logistic regression when classifying structured data, pretrained convolutional neural networks for vision related tasks, and recurrent neural networks and gradient boosted trees for sequence modeling (A to measure the baseline's performance (e.g., accuracy)
this metric will then become what you compare any other machine learning algorithm against.
A machine learning algorithm tries to learn a function that models the relationship between the input (feature) data and the target variable label.