In [15]:
import string
import re
import pandas as pd
import time

from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import accuracy_score, log_loss
from fasttext import supervised, load_model

### Loading data

In [16]:
# Rotten Tomatoes reviews, stored as a pipe-delimited file.
data_rt = pd.read_csv(
    'reviews_rt_all.csv',
    delimiter='|',
)
# Preview the first rows to check that the file was parsed as expected.
data_rt.head()

Unnamed: 0,label,text
0,1,"To an entire generation of filmgoers, it just ..."
1,1,Pixar classic is one of the best kids' movies ...
2,1,Apesar de representar um imenso avanço tecnoló...
3,1,"When Woody perks up in the opening scene, it's..."
4,1,Introduced not one but two indelible character...


In [17]:
# IMDB reviews, stored as a pipe-delimited file.
data_imdb = pd.read_csv(
    'imdb_small.csv',
    delimiter='|',
)
# Preview the first rows to check that the file was parsed as expected.
data_imdb.head()

Unnamed: 0,label,text
0,0,Once again Mr. Costner has dragged out a movie...
1,0,This is an example of why the majority of acti...
2,0,"First of all I hate those moronic rappers, who..."
3,0,Not even the Beatles could write songs everyon...
4,0,Brass pictures (movies is not a fitting word f...


### Processing data

* To normalize the text, we expanded negative contractions such as haven't, doesn't and shouldn't (e.g. "haven't" becomes "have not").
* Also, as the same actors can play in both good and bad movies, we attempted to remove their names (at least in the cases where the names appear in brackets). So we removed all text that appears inside brackets.
* After that we removed digits, punctuation and other special signs, and one-letter words.

In [5]:
# Every punctuation mark except '.', which clean_data handles in its own step.
redundant_signs = set(string.punctuation) - set(['.'])
# Characters a fully cleaned English text may contain: a-z, '.' and the space.
letters = list(string.ascii_lowercase + '. ')
# Maps each redundant sign to a space so all of them are replaced in one pass.
_SIGNS_TO_SPACE = str.maketrans({sign: ' ' for sign in redundant_signs})


def clean_data(inp_str):
    """Normalize one raw review into lowercase, space-separated words.

    Steps, in order:
      1. lowercase the text;
      2. expand "n't" to " not" and a trailing "'re" to " are";
      3. drop every (...), [...] or {...} span (often actor names or asides);
      4. replace punctuation, digits and dots with spaces;
      5. drop stand-alone single letters a-z;
      6. collapse any run of whitespace into a single space.

    Parameters
    ----------
    inp_str : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text; an empty string when nothing is left.
    """
    inp_str = inp_str.lower()

    # fix haven't|doesn't|shouldn't cases
    inp_str = inp_str.replace("n't", " not")
    # they're -> they are. The lookbehind/word boundary keep quoted words such
    # as 'red' from being mangled into " ared".
    inp_str = re.sub(r"(?<=[a-z])'re\b", " are", inp_str)

    # Bracketed spans may hold actor names, clarifications etc.; they are
    # treated as redundant and removed together with their brackets.
    inp_str = re.sub(r'[\(\[\{].+?[\)\]\}]', '', inp_str)

    # replace redundant_signs
    inp_str = inp_str.translate(_SIGNS_TO_SPACE)

    # replace digits
    inp_str = re.sub(r'\d', ' ', inp_str)
    # Replace every run of dots (a single dot included) with a space, so no
    # dots survive in the cleaned text.
    inp_str = re.sub(r'\.+', ' ', inp_str)
    # replace one-letter words or just letters
    inp_str = re.sub(r'\b[a-z]\b', ' ', inp_str)

    # split() without arguments also swallows tabs and newlines, so the result
    # is always a single line with single spaces.
    return ' '.join(inp_str.split())

In [23]:
# Run every raw IMDB review through clean_data and keep the result in a new
# column next to the original text.
imdb_raw_text = data_imdb['text']
data_imdb['clean_text'] = imdb_raw_text.map(clean_data)
print('source data_imdb: ', data_imdb.shape)
data_imdb.head()

(50000, 3)


Unnamed: 0,label,text,clean_text
0,0,Once again Mr. Costner has dragged out a movie...,once again mr costner has dragged out movie fo...
1,0,This is an example of why the majority of acti...,this is an example of why the majority of acti...
2,0,"First of all I hate those moronic rappers, who...",first of all hate those moronic rappers who co...
3,0,Not even the Beatles could write songs everyon...,not even the beatles could write songs everyon...
4,0,Brass pictures (movies is not a fitting word f...,brass pictures really are somewhat brassy thei...


In [24]:
# Run every raw RT review through clean_data and keep the result in a new
# column next to the original text.
rt_raw_text = data_rt['text']
data_rt['clean_text'] = rt_raw_text.map(clean_data)
print('source data_rt: ', data_rt.shape)
data_rt.head()

(102610, 3)


Unnamed: 0,label,text,clean_text
0,1,"To an entire generation of filmgoers, it just ...",to an entire generation of filmgoers it just m...
1,1,Pixar classic is one of the best kids' movies ...,pixar classic is one of the best kids movies o...
2,1,Apesar de representar um imenso avanço tecnoló...,apesar de representar um imenso avanço tecnoló...
3,1,"When Woody perks up in the opening scene, it's...",when woody perks up in the opening scene it no...
4,1,Introduced not one but two indelible character...,introduced not one but two indelible character...


### Finalizing data

* To remove stop words we used the NLTK library. Stop words are very common words such as: in, at, over, the, under, etc.
* As we found that the data also contains non-English reviews, we decided to remove them. We check every review, and if it contains any character other than lowercase ASCII letters, dots and spaces, we decide that it is written in another language and remove it.
* Also, as there is no lemmatizer in the Gensim library for Python 3 and the lemmatizer in the NLTK library is worse, we decided to use stemming from the NLTK library instead. (The Gensim lemmatizer processes a whole sentence, while NLTK can only process each word separately. This makes it impossible to use Gensim and resource-expensive to use NLTK lemmatization.)
* We removed rows whose text field became empty.
* As fastText needs labels to be marked, our labels are presented in this form: `__label__1` or `__label__0`.

In [18]:
stop_words = list(set(stopwords.words('english')))
stemmer = PorterStemmer()


def finalize_data(df):
    """Build the fastText-ready frame from a frame holding cleaned reviews.

    Processing:
      * stop words are removed from ``clean_text``;
      * rows whose remaining text is empty, or contains any character outside
        ``letters`` (treated as "not English"), are dropped;
      * the surviving texts are stemmed word by word;
      * ``label`` becomes ``'__label__1 '`` when it equals 1 and
        ``'__label__0 '`` otherwise (note the trailing space).

    The input frame is left untouched; no helper columns are added to it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``clean_text`` (str) and ``label``.

    Returns
    -------
    pandas.DataFrame
        Columns ``new_label`` and ``stemed_text`` with a fresh 0..n-1 index.
    """
    # Sets give O(1) membership tests; the module-level list is scanned
    # linearly for every single token otherwise.
    stop_set = frozenset(stop_words)
    allowed_chars = frozenset(letters)

    def _drop_stop_words(text):
        # Keep the words of `text` that are not stop words.
        return ' '.join(word for word in text.split() if word not in stop_set)

    def _stem(text):
        # Stem each word of `text` independently.
        return ' '.join(stemmer.stem(word) for word in text.split())

    text_wo_stops = df['clean_text'].apply(_drop_stop_words)
    # Keep a row only if some text is left and every character is allowed.
    keep = text_wo_stops.apply(
        lambda text: bool(text) and allowed_chars.issuperset(text)
    ).astype(bool)

    # Filter first, then stem: stemming is the expensive step and there is no
    # point in running it on rows that are thrown away.
    # Building from plain lists yields a clean RangeIndex directly.
    return pd.DataFrame({
        'new_label': ['__label__1 ' if label == 1 else '__label__0 '
                      for label in df.loc[keep, 'label']],
        'stemed_text': [_stem(text) for text in text_wo_stops[keep]],
    })

In [25]:
# Replace the working IMDB frame with its finalized version. The name is
# rebound, so the cleaning cell has to be re-run before running this cell again.
data_imdb = data_imdb.pipe(finalize_data)
print('cleaned data_imdb: ', data_imdb.shape)
data_imdb.head()

(102610, 3)


Unnamed: 0,label,text,clean_text
0,1,"To an entire generation of filmgoers, it just ...",to an entire generation of filmgoers it just m...
1,1,Pixar classic is one of the best kids' movies ...,pixar classic is one of the best kids movies o...
2,1,Apesar de representar um imenso avanço tecnoló...,apesar de representar um imenso avanço tecnoló...
3,1,"When Woody perks up in the opening scene, it's...",when woody perks up in the opening scene it no...
4,1,Introduced not one but two indelible character...,introduced not one but two indelible character...


In [None]:
# Replace the working RT frame with its finalized version. The name is
# rebound, so the cleaning cell has to be re-run before running this cell again.
data_rt = data_rt.pipe(finalize_data)
print('cleaned data_rt: ', data_rt.shape)
data_rt.head()

### Model learning and accuracy evaluation

To split the data and evaluate our models we will use the __StratifiedShuffleSplit__ class.

It's a combination of StratifiedKFold and ShuffleSplit. For each dataset a model will be trained and its accuracy evaluated on 5 random stratified splits.

The main task is to train 2 classifiers on different datasets and check how well each one applies to the other dataset. The main difference between the datasets is that IMDB reviews are long while RT reviews are short.

In [None]:
# Shared splitter for both datasets: 5 splits, a quarter of the rows held out
# for testing in each, and a fixed seed so the splits are repeatable.
sss = StratifiedShuffleSplit(
    n_splits=5,
    test_size=0.25,
    random_state=42,
)

In [None]:
print('IMDB dataset:\n')

# Train and score one fastText model per split. Each split overwrites
# 'train_imdb.txt' and the 'model_imdb' output, so only the model of the last
# split remains on disk afterwards.
for train_index, test_index in sss.split(data_imdb['stemed_text'], data_imdb['new_label']):
    # The splitter yields positional row numbers, hence .iloc.
    train_df = data_imdb.iloc[train_index]
    test_df = data_imdb.iloc[test_index]
    # '__label__1 ' -> 1, '__label__0 ' -> 0. Kept as a separate Series rather
    # than a new column, so nothing is assigned onto a slice of data_imdb.
    test_labels = test_df['new_label'].str.strip().str[-1].astype(int)

    # One "<label> <text>" example per line. new_label already ends with a
    # space, so plain concatenation is enough; writing through to_csv put a
    # comma between label and text, which ended up glued to the first word.
    with open('train_imdb.txt', 'w', encoding='utf-8') as train_file:
        train_file.writelines(
            label + text + '\n'
            for label, text in zip(train_df['new_label'], train_df['stemed_text']))

    start_time = time.time()
    classifier = supervised('train_imdb.txt', 'model_imdb', label_prefix='__label__')
    learning_time = time.time()
    prediction = classifier.predict_proba(list(test_df['stemed_text']))
    prediction_time = time.time()
    # Each prediction is indexed as [0][0] -> label, [0][1] -> probability.
    test_predictions = [int(top[0][0]) for top in prediction]
    test_probabilities = [top[0][1] for top in prediction]

    print("=" * 30)
    print('****Results****')
    # %.2f instead of %d: %d cut the timings down to whole seconds.
    print('Learning model: %.2f seconds' % (learning_time - start_time))
    print('Cross-validation time: %.2f seconds' % (prediction_time - learning_time))
    acc = accuracy_score(test_labels, test_predictions)
    print("Accuracy: {:.4%} \n".format(acc))

In [None]:
print('RT dataset:\n')

# Train and score one fastText model per split. Each split overwrites
# 'train_rt.txt' and the 'model_rt' output, so only the model of the last
# split remains on disk afterwards.
for train_index, test_index in sss.split(data_rt['stemed_text'], data_rt['new_label']):
    # The splitter yields positional row numbers, hence .iloc.
    train_df = data_rt.iloc[train_index]
    test_df = data_rt.iloc[test_index]
    # '__label__1 ' -> 1, '__label__0 ' -> 0. Kept as a separate Series rather
    # than a new column, so nothing is assigned onto a slice of data_rt.
    test_labels = test_df['new_label'].str.strip().str[-1].astype(int)

    # One "<label> <text>" example per line. new_label already ends with a
    # space, so plain concatenation is enough; writing through to_csv put a
    # comma between label and text, which ended up glued to the first word.
    with open('train_rt.txt', 'w', encoding='utf-8') as train_file:
        train_file.writelines(
            label + text + '\n'
            for label, text in zip(train_df['new_label'], train_df['stemed_text']))

    start_time = time.time()
    classifier = supervised('train_rt.txt', 'model_rt', label_prefix='__label__')
    learning_time = time.time()
    prediction = classifier.predict_proba(list(test_df['stemed_text']))
    prediction_time = time.time()
    # Each prediction is indexed as [0][0] -> label, [0][1] -> probability.
    test_predictions = [int(top[0][0]) for top in prediction]
    test_probabilities = [top[0][1] for top in prediction]

    print("=" * 30)
    print('****Results****')
    # %.2f instead of %d: %d cut the timings down to whole seconds.
    print('Learning model: %.2f seconds' % (learning_time - start_time))
    print('Cross-validation time: %.2f seconds' % (prediction_time - learning_time))
    acc = accuracy_score(test_labels, test_predictions)
    print("Accuracy: {:.4%} \n".format(acc))

In [None]:
print('RT clasifier to IMDB dataset:\n')

# Numeric ground truth recovered from the fastText label: '__label__1 ' -> 1.
data_imdb['label'] = data_imdb['new_label'].apply(lambda x: int(x.strip()[-1]))

# Score the whole IMDB set with the model saved as 'model_rt.bin'.
classifier = load_model('model_rt.bin', label_prefix='__label__')
start_time = time.time()
prediction = classifier.predict_proba(list(data_imdb['stemed_text']))
prediction_time = time.time()
# Each prediction is indexed as [0][0] -> label, [0][1] -> probability.
data_predictions = [int(item[0][0]) for item in prediction]
data_probabilities = [item[0][1] for item in prediction]

print("=" * 30)
print('****Results****')
# Elapsed time is end minus start (the operands were swapped, giving a
# negative duration); %.2f keeps the fraction that %d threw away.
print('Applying clasifier time: %.2f seconds' % (prediction_time - start_time))
# Compare against the predictions made in this cell, not the ones left over
# from the per-dataset evaluation loops.
acc = accuracy_score(data_imdb['label'], data_predictions)
print("Accuracy: {:.4%} \n".format(acc))

In [None]:
print('IMDB clasifier to RT dataset:\n')

# Numeric ground truth recovered from the fastText label: '__label__1 ' -> 1.
data_rt['label'] = data_rt['new_label'].apply(lambda x: int(x.strip()[-1]))

# Score the whole RT set with the model saved as 'model_imdb.bin'.
classifier = load_model('model_imdb.bin', label_prefix='__label__')
start_time = time.time()
prediction = classifier.predict_proba(list(data_rt['stemed_text']))
prediction_time = time.time()
# Each prediction is indexed as [0][0] -> label, [0][1] -> probability.
data_predictions = [int(item[0][0]) for item in prediction]
data_probabilities = [item[0][1] for item in prediction]

print("=" * 30)
print('****Results****')
# Elapsed time is end minus start (the operands were swapped, giving a
# negative duration); %.2f keeps the fraction that %d threw away.
print('Applying clasifier time: %.2f seconds' % (prediction_time - start_time))
# Compare against the predictions made in this cell, not the ones left over
# from the per-dataset evaluation loops.
acc = accuracy_score(data_rt['label'], data_predictions)
print("Accuracy: {:.4%} \n".format(acc))