# 1. Import and concatenation of datasets

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split

In [2]:
# Keep only the review text and its rating, discard neutral 3-star rows, and
# binarise what is left: floor-dividing by 3 sends 1 and 2 to 0 and 4 and 5
# to 1 (assumes ratings are integers on a 1-5 scale — TODO confirm).
data = (
    pd.read_csv('data/fixed_raw_data.csv')
    .loc[:, ['Review', 'Rate']]
    .loc[lambda frame: frame['Rate'] != 3]
    .reset_index(drop=True)
    .assign(Rate=lambda frame: frame['Rate'] // 3)
)

In [3]:
# Class balance right after binarisation.
data['Rate'].value_counts()

1    4237
0      76
Name: Rate, dtype: int64

This is a very strong class imbalance. Fortunately, we have already parsed additional negative reviews, using the same parsing method as before.

In [4]:
# Extra negative reviews collected separately to offset the class imbalance.
neg_data = pd.read_csv(filepath_or_buffer='data/neg_data.csv')

In [5]:
# Append the extra negative reviews; ignore_index renumbers the rows 0..n-1.
# NOTE: re-running this cell without re-running the loading cell appends
# neg_data a second time.
data = pd.concat([data, neg_data], ignore_index=True)

In [6]:
# Class balance after adding the extra negative reviews.
data['Rate'].value_counts()

1    4237
0    3332
Name: Rate, dtype: int64

The classes are better balanced now. However, part of the new dataset was not written by German speakers.

# 2. Data cleaning

Several problems emerged during parsing. Firstly, reviews with videos contain the text 'Your browser does not support HTML5 video.' at the start of the post. Secondly, some reviews are written in other languages, and we want only German reviews.

In [7]:
def support_html_del(text):
    """Strip the HTML5-video fallback notice from the start of a review.

    Args:
        text: Review text.

    Returns:
        ``text`` without the leading notice when it starts with exactly
        'Your browser does not support HTML5 video.'; otherwise ``text``
        unchanged. Whitespace that follows the notice is kept.
    """
    video_notice = 'Your browser does not support HTML5 video.'
    # Derive the cut position from the literal instead of hard-coding 42.
    notice_len = len(video_notice)
    return text[notice_len:] if text[:notice_len] == video_notice else text

In [8]:
# Remove the video fallback notice from every review.
data['Review'] = data['Review'].map(support_html_del)

In [9]:
# Substrings whose presence marks a review as non-German. They are matched
# with a plain `in` test against the lowercased review padded with one space
# on each side, so:
#   * ' word ' (spaces on both sides) matches only the whole word;
#   * ' word'  (leading space only) matches any word starting with it;
#   * 'just'   (no spaces) matches anywhere, even inside a longer word —
#     NOTE(review): confirm this is intended.
# The repeated entries (' lo ', ' se ', ' and ') were dropped; only membership
# matters to the filter, so its behaviour is unchanged.
foreign_words = [
    ' very', ' with', ' the', ' great', ' need', ' love', ' awesome',
    'just', ' say', ' ik ', ' did ', ' bad ', ' sono ', ' to be ',
    ' le ', ' el ', ' as ', ' al ', ' per ', ' if ', ' de ', ' il ',
    ' would ', ' la ', ' this ', ' had ', ' working ', ' beware', ' go ',
    ' after ', ' is ', ' too ', ' di ', ' se ', ' si ', ' senza ',
    ' buen', ' ritardo ', ' normalito ', ' vale ', ' ma ', ' telefono ',
    ' e ', ' mi ', ' con ', ' do ', ' lo ', ' un ', ' y ', ' nul ',
    ' and ', ' cose ', ' receive', ' return', ' en ', ' buono ',
    ' tiene ', ' muy ', ' arrivato ', ' bene ', ' pas ', ' lousy',
    ' off ', ' anche ', ' buon ', ' decepcionante ', ' prodotto ',
    ' recomiendo ', ' regular ', ' venditore', ' em ', ' meu ',
    ' problemsa', ' mas ', ' trave ', ' poca ', ' molto ', ' dando ',
    ' flop ', ' capacida', ' q ', ' produto ', ' mo ', ' facile ',
    # NOTE(review): the next two entries contain uppercase letters, but the
    # review text is lowercased before matching, so they can never match.
    # Lowercasing them would change which reviews are dropped (for example
    # ' c ' would hit "usb c "), so they are kept as written.
    ' C ', ' Kit ',
    ' me ', ' worst ', ' ever ', ' a ', ' good ', ' fun ', ' vendeur ',
]

In [10]:
# Characters that mark a review as non-German. Only the individual characters
# matter, not their order or repetition; the Japanese run appears to be a
# sentence pasted from a review. Uppercase letters in it can never match,
# because the text is lowercased before the check.
_NON_GERMAN_CHARS = frozenset(
    'çàąćęłńóśźżйцкнгшщзъфывплджэячмиьбюéèáñ'
    'に対応との事ですが実際使ってみると動作が遅くなりますこの価格ならあと数千円出して有機ＥＬディスプレイの'
    'ãéíáóù'
)


def is_german_language(text, words=None):
    """Heuristically decide whether a review is written in German.

    The review is lowercased and padded with one space on each side. It is
    rejected when it contains any character from ``_NON_GERMAN_CHARS`` or any
    substring from ``words``.

    Args:
        text: Review text. Anything that is not a ``str`` (for example the NaN
            pandas produces for an empty CSV cell) is treated as non-German
            instead of raising.
        words: Optional iterable of marker substrings. Defaults to the
            notebook-level ``foreign_words`` list, looked up at call time.

    Returns:
        ``True`` when no foreign marker is found, ``False`` otherwise.
    """
    if not isinstance(text, str):
        return False
    if words is None:
        words = foreign_words
    # Padding lets markers such as ' the' or ' is ' match at either end.
    padded = ' ' + text.lower() + ' '
    if not _NON_GERMAN_CHARS.isdisjoint(padded):
        return False
    return not any(word in padded for word in words)

In [11]:
# Keep only the reviews the heuristic accepts as German, then renumber rows.
data = data.loc[data['Review'].map(is_german_language)].reset_index(drop=True)

In [12]:
# Class balance after the language filter.
data['Rate'].value_counts()

1    4176
0    2667
Name: Rate, dtype: int64

Not perfect, but we can work with this.

In [13]:
# Save the cleaned dataset; index=False keeps the row index out of the file.
data.to_csv(path_or_buf='data/data.csv', index=False)

# 3. Train and test split

In [14]:
# Hold out 10% of the rows for testing. Stratifying on the label keeps the
# class ratio the same in both parts, and the fixed random_state makes the
# shuffle repeatable.
data_train, data_test = train_test_split(
    data,
    test_size=0.1,
    shuffle=True,
    stratify=data['Rate'].values,
    random_state=108,
)

In [15]:
# Save both splits; index=False keeps the row index out of the files.
data_train.to_csv(path_or_buf='data/train.csv', index=False)
data_test.to_csv(path_or_buf='data/test.csv', index=False)