In [1]:
import pandas as pd
import numpy as np
from googletrans import Translator
import string
import time
RND_STATE = 515991

## 0. Config

In [2]:
max_string_length = 300

In [3]:
def log_progress(sequence, every=10):
    from ipywidgets import IntProgress
    from IPython.display import display

    progress = IntProgress(min=0, max=len(sequence), value=0)
    display(progress)
    
    for index, record in enumerate(sequence):
        if index % every == 0:
            progress.value = index
        yield record

In [4]:
def chunks(l, n):
    for i in range(0, len(l), n):
        yield l[i:i + n]

## 1. Loading data

### 1.1 Loading and decoding data

In [5]:
data = pd.read_csv('123.csv', sep=',', encoding='utf8')
data = data.loc[:, ~data.columns.str.contains('^Unnamed')]
data_length_start = len(data)
print('Initial length: {0}'.format(data_length_start))
data.describe()

Initial length: 24733


Unnamed: 0,from,from_id,message,picture,link,name,description,type,created_time,shares,post_id
count,24643,24621,17456,22342,22861,19305,7533,24564,24562,13340,24558
unique,757,744,14711,18236,18901,5899,6114,18,21051,294,21301
top,Russian Tour,_75258759941,Buonanotte!,https://external.xx.fbcdn.net/safe_image.php?d...,http://www.russiantour.com/,Timeline Photos,Russian Gateway Tours,photo,2013-10-09T01:00:00+0000,1,640760615984253_915559535171025
freq,6130,6130,87,217,18,5015,74,16296,14,3119,2


## 2. Preprocessing

### 2.1 Removing puctuation

In [6]:
data['message'] = data['message'].str.replace(r'['+string.punctuation+']', '')
data['description'] = data['description'].str.replace(r'['+string.punctuation+']', '')

### 2.2 Removing nans and unnecessary columns

In [7]:
data = data.drop(['picture', 'link'], axis=1)
data = data.dropna(how='all')
data = data.dropna(subset=['message'])
print('Removed {0} elements'.format(data_length_start - len(data)))
print('New length is {0}'.format(len(data)))

Removed 7277 elements
New length is 17456


### 2.3 Removing emojis and links

emojis

In [8]:
data['message'] = data['message'].str.replace(u'[^\U00000000-\U0000d7ff\U0000e000-\U0000ffff]', '')

links

In [9]:
data['message'] = data['message'].str.replace(r'^https?:\/\/.*[\r\n]*', '')

### 2.4 Removing advertisements

In [10]:
data = data[~data['from'].str.contains("Visit-Petersburg", na=False)]
print('Removed {0} elements'.format(data_length_start - len(data)))
print('New length is {0}'.format(len(data)))
data.describe()

Removed 8247 elements
New length is 16486


Unnamed: 0,from,from_id,message,name,description,type,created_time,shares,post_id
count,16486,16486,16486,13512,4242,16442,16440,9574,16436
unique,599,601,13710,3853,3431,17,14442,291,14523
top,Russian Tour,_75258759941,Buonanotte,Timeline Photos,Russian Gateway Tours,photo,2013-10-09T01:00:00+0000,1,640760615984253_970827869644191
freq,4480,4480,98,4344,74,10983,14,1977,2


### 2.5 Stripping texts

In [11]:
data['message'] = data['message'].apply(lambda x: x[:max_string_length].strip())
data['description'] = data['message'].apply(lambda x: x[:max_string_length].strip())

### 2.6 Translating texts into english

In [12]:
def translate_big_text(texts_source, dest = 'en', chunk_size = 800, delay=0):
    translated = []
    chunks_list = list(chunks(texts_source, chunk_size))
    i = 1
    for chunk in log_progress(chunks_list, every=1):
        translated.extend(translate(chunk, delay=delay)) 
        print('Translated chunk #{0} of {1}'.format(i, len(chunks_list)))
        i += 1 
    return translated

In [13]:
def translate(texts_source, dest='en', delay=10):
    translator= Translator()
    translated = []
    err_count = 0
    for text in log_progress(texts_source):
        try:
            translated.append(translator.translate(u''.join(text), dest=dest).text)
        except Exception as e:
            translated.append('')
            err_count += 1
#             print('Error while translating ', text)
#             print(e)
    print('Number of errors: ', err_count)
    print('Waiting {0} sec'.format(delay))
    time.sleep(delay)
    return translated

In [14]:
msg_translations = translate_big_text(data['message'].values)

A Jupyter Widget

A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #1 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #2 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #3 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #4 of 21


A Jupyter Widget

Number of errors:  144
Waiting 0 sec
Translated chunk #5 of 21


A Jupyter Widget

Number of errors:  39
Waiting 0 sec
Translated chunk #6 of 21


A Jupyter Widget

Number of errors:  2
Waiting 0 sec
Translated chunk #7 of 21


A Jupyter Widget

Number of errors:  24
Waiting 0 sec
Translated chunk #8 of 21


A Jupyter Widget

Number of errors:  41
Waiting 0 sec
Translated chunk #9 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #10 of 21


A Jupyter Widget

Number of errors:  106
Waiting 0 sec
Translated chunk #11 of 21


A Jupyter Widget

Number of errors:  173
Waiting 0 sec
Translated chunk #12 of 21


A Jupyter Widget

Number of errors:  143
Waiting 0 sec
Translated chunk #13 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #14 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #15 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #16 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #17 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #18 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #19 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #20 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #21 of 21


In [20]:
data['message'] = msg_translations

In [22]:
print('Number of skipped = ', 672)

Number of skipped =  672


In [21]:
descr_translations = translate_big_text(data['description'].values)

A Jupyter Widget

A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #1 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #2 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #3 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #4 of 21


A Jupyter Widget

Number of errors:  166
Waiting 0 sec
Translated chunk #5 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #6 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #7 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #8 of 21


A Jupyter Widget

Number of errors:  22
Waiting 0 sec
Translated chunk #9 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #10 of 21


A Jupyter Widget

Number of errors:  82
Waiting 0 sec
Translated chunk #11 of 21


A Jupyter Widget

Number of errors:  166
Waiting 0 sec
Translated chunk #12 of 21


A Jupyter Widget

Number of errors:  123
Waiting 0 sec
Translated chunk #13 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #14 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #15 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #16 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #17 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #18 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #19 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #20 of 21


A Jupyter Widget

Number of errors:  0
Waiting 0 sec
Translated chunk #21 of 21


In [23]:
data['description'] = descr_translations

In [25]:
print('Number of skipped = ', 559)

Number of skipped =  559


In [26]:
data.to_csv('data_converted.csv')