In [2]:
import pandas as pd
import numpy as np
from googletrans import Translator
import string
import time
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
import gender_guesser.detector as gender
import matplotlib.pyplot as plt
import json
import numbers
import multiprocessing
from itertools import product
RND_STATE = 515991

%matplotlib inline

## 0. Config

In [3]:
max_string_length = 500
output_csv = 'data_converted.csv'

In [4]:
def log_progress(sequence, every=None, size=None, name='Processed'):
    from ipywidgets import IntProgress, HTML, VBox
    from IPython.display import display

    is_iterator = False
    if size is None:
        try:
            size = len(sequence)
        except TypeError:
            is_iterator = True
    if size is not None:
        if every is None:
            if size <= 200:
                every = 1
            else:
                every = int(size / 200)     # every 0.5%
    else:
        assert every is not None, 'sequence is iterator, set every'

    if is_iterator:
        progress = IntProgress(min=0, max=1, value=1)
        progress.bar_style = 'info'
    else:
        progress = IntProgress(min=0, max=size, value=0)
    label = HTML()
    box = VBox(children=[label, progress])
    display(box)

    index = 0
    try:
        for index, record in enumerate(sequence, 1):
            if index == 1 or index % every == 0:
                if is_iterator:
                    label.value = '{name}: {index} / ?'.format(
                        name=name,
                        index=index
                    )
                else:
                    progress.value = index
                    label.value = u'{name}: {index} / {size}'.format(
                        name=name,
                        index=index,
                        size=size
                    )
            yield record
    except:
        progress.bar_style = 'danger'
        raise
    else:
        progress.bar_style = 'success'
        progress.value = index
        label.value = "{name}: {index}".format(
            name=name,
            index=str(index or '?')
        )

In [5]:
def chunks(l, n):
    for i in range(0, len(l), n):
        yield l[i:i + n]

## 1. Loading data

### 1.1 Loading and decoding data

In [6]:
data = pd.read_csv('123.csv', sep=',', encoding='utf8')
data = data.loc[:, ~data.columns.str.contains('^Unnamed')]
data_length_start = len(data)
print('Initial length: {0}'.format(data_length_start))
data.describe()

Initial length: 24733


Unnamed: 0,from,from_id,message,picture,link,name,description,type,created_time,shares,post_id
count,24643,24621,17456,22342,22861,19305,7533,24564,24562,13340,24558
unique,757,744,14711,18236,18901,5899,6114,18,21051,294,21301
top,Russian Tour,_75258759941,Buonanotte!,https://external.xx.fbcdn.net/safe_image.php?d...,http://www.russiantour.com/,Timeline Photos,Russian Gateway Tours,photo,2013-10-09T01:00:00+0000,1,792410744188300_944984215597618
freq,6130,6130,87,217,18,5015,74,16296,14,3119,2


## 2. Preprocessing

### 2.1 Removing nans and unnecessary columns

In [7]:
data = data.drop(['picture', 'link', 'from_id', 'post_id'], axis=1)
data[['shares']] = data[['shares']].fillna(value=0)
data = data.dropna(how='all')
data = data.dropna(subset=['message'])
print('Removed {0} elements'.format(data_length_start - len(data)))
print('New length is {0}'.format(len(data)))

Removed 7277 elements
New length is 17456


### 2.2 Removing emojis, links and numbers

emojis

In [8]:
data['message'] = data['message'].str.replace(u'[^\U00000000-\U0000d7ff\U0000e000-\U0000ffff]', '')
data['description'] = data['description'].str.replace(u'[^\U00000000-\U0000d7ff\U0000e000-\U0000ffff]', '')

links

In [9]:
data['message'] = data['message'].str.replace(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '')
data['description'] = data['message'].str.replace(r'(https|http)?:\/\/(\w|\.|\/|\?|\=|\&|\%)*\b', '')

numbers

In [10]:
data['message'] = data['message'].str.replace(r'\b\d+\b','')
data['description'] = data['message'].str.replace(r'\b\d+\b','')

special chars

In [11]:
data['message'] = data['message'].str.replace('"', '')
data['message'] = data['message'].str.replace('#', '')

data['description'] = data['description'].str.replace('"', '')
data['description'] = data['description'].str.replace('#', '')

### 2.3 Removing advertisements

In [12]:
data = data[~data['from'].str.contains("Visit-Petersburg", na=False)]
print('Removed {0} elements'.format(data_length_start - len(data)))
print('New length is {0}'.format(len(data)))
data.describe()

Removed 8247 elements
New length is 16486


Unnamed: 0,from,message,name,description,type,created_time,shares
count,16486,16486.0,13512,16486.0,16442,16440,16486
unique,599,13460.0,3853,13460.0,17,14442,292
top,Russian Tour,,Timeline Photos,,photo,2013-10-09T01:00:00+0000,0
freq,4480,286.0,4344,286.0,10983,14,6912


### 2.4 Stripping texts and converting types

In [13]:
def convert_column_int(column):
    converted = []
    for value in column.values:
        if str(value).isdigit():
            converted.append(value)
        else:
            converted.append(0)
    return converted

In [14]:
data['message'] = data['message'].apply(lambda x: x[:max_string_length].strip())
data['description'] = data['message'].apply(lambda x: x[:max_string_length].strip())

In [15]:
data['shares'] = convert_column_int(data['shares'])
data[['shares']] = data[['shares']].apply(pd.to_numeric)

### 2.5 Translating texts into english

In [16]:
def translate_big_text(texts_source, dest = 'en', chunk_size = 250, delay=0):
    translated = []
    chunks_list = list(chunks(texts_source, chunk_size))
    i = 1
    for chunk in log_progress(chunks_list):
        translated.extend(translate_big_text_internal(chunk)) 
        print('Translated chunk #{0} of {1}'.format(i, len(chunks_list)))
        print('Waiting {0} sec'.format(delay))
        time.sleep(delay)
        i += 1 
    return translated

In [17]:
def translate(text, dest='en'):
    translator = Translator()
    translated = ''
    try:
        translated = translator.translate(u''.join(text), dest=dest).text
    except Exception as e:
        print(e)
    return translated

In [18]:
def translate_big_text_internal(texts_source, dest='en'):
    with multiprocessing.Pool(processes=30) as pool:
        results = pool.starmap(translate, product(texts_source))
    
    print('Number of errors: ', results.count(''))
    return results

In [None]:
msg_translations = translate_big_text(data['message'].values)

A Jupyter Widget

In [None]:
data['message'] = msg_translations

In [None]:
print('Number of skipped = ', 672)

In [None]:
descr_translations = translate_big_text(data['description'].values)

In [None]:
data['description'] = descr_translations

In [None]:
print('Number of skipped = ', 559)

In [None]:
names_translations = translate_big_text(data['from'].values)

In [None]:
data['from'] = names_translations

In [None]:
print('Number of skipped = ', 545)

## 2.7 Moving all words to lower case

In [None]:
data["message"] = data["message"].map(lambda x: x if type(x)!=str else x.lower())
data["description"] = data["description"].map(lambda x: x if type(x)!=str else x.lower())

### 2.8 Tokenizing sentences

In [None]:
data['message'] = data['message'].apply(lambda x: nltk.word_tokenize(str(x)))
data['description'] = data['description'].apply(lambda x: nltk.word_tokenize(str(x)))

### 2.9 Removing stopwords and punctuation

In [None]:
stop_words = stopwords.words('english')
stop_words += (['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}', '\n', '/n', '-', '#', 'visitpetersburg', 'visitrussia', 'russia', 'the', 'is', 'a'])

In [None]:
data['message'] = data['message'].apply(lambda x: list(filter(lambda y: y not in stop_words, x)))
data['description'] = data['description'].apply(lambda x: list(filter(lambda y: y not in stop_words, x)))

### 2.10 Saving data to file

In [None]:
def save_csv():
    data.to_csv(output_csv, index=False)

In [None]:
def parse_csv_tuple(series):
    values = series.values
    parsed = []
    for value in values:
        parsed.append(json.loads(value.replace("'",'"')))
    return parsed

In [None]:
def load_csv():
    loaded_data = pd.read_csv(output_csv, sep=',', encoding='utf8')
    loaded_data['message_sentiment_vader'] = parse_csv_tuple(loaded_data['message_sentiment_vader'])
    loaded_data['description_sentiment_vader'] = parse_csv_tuple(loaded_data['description_sentiment_vader'])
    # loaded_data['message'] = parse_csv_arr(loaded_data['message'])
    # loaded_data['description'] = parse_csv_arr(loaded_data['description'])
    return loaded_data

In [None]:
save_csv()

# 3. Processing data

## 3.1 Trying to guess gender by name

In [None]:
detector = gender.Detector(case_sensitive=False)

In [None]:
data['from_gender'] = [detector.get_gender((str(name)).split(' ')[0]) for name in data['from'].values]

In [None]:
data['from_gender'].replace([0],['unknown'],inplace=True)

## 3.1 Getting sentiments by Vader algorithm

In [None]:
sid = SentimentIntensityAnalyzer()
message_sent = []
descr_sent = []
data['message'].apply(lambda x: message_sent.append(sid.polarity_scores((" ".join(x)))));
data['description'].apply(lambda x: descr_sent.append(sid.polarity_scores((" ".join(x)))));
data['message_sentiment_vader'] = message_sent
data['description_sentiment_vader'] = descr_sent

In [None]:
def get_message_sentiment(series, border_line=0.2):
    values = series.values
    parsed_values = []
    for value in values:
        if value['neg'] >= border_line:
            parsed_values.append('negative')
        elif value['pos'] >= border_line:
            parsed_values.append('positive')
        else:
            parsed_values.append('neutral')
    return parsed_values

In [None]:
data['message_sentiment_vader_parsed'] = get_message_sentiment(data['message_sentiment_vader'])
data['description_sentiment_vader_parsed'] = get_message_sentiment(data['description_sentiment_vader'])

In [None]:
save_csv()

In [None]:
hh = load_csv()

## 3.2 Plotting stats 

### 3.2.1 Shares per gender

In [None]:
data.groupby(['from_gender']).sum()

#### 3.2.1.1 Row data

In [None]:
groups = data.groupby(['from_gender']).size()
groups = groups.drop(['andy'], axis=0)
ax = groups.plot(kind = 'bar', grid = True, title = 'Reviews in tourist groups by gender', figsize=(8, 8))
ax.set_xlabel('Gender');
ax.set_ylabel('Number of shares');

#### 3.2.1.2 Only genders

In [None]:
groups = data.groupby(['from_gender']).size()
groups = groups.drop(['andy', 'unknown'], axis=0)
groups['female'] += groups['mostly_female']
groups['male'] += groups['mostly_male']
groups = groups.drop(['mostly_female', 'mostly_male'], axis=0)

In [None]:
ax = groups.plot(kind = 'bar', grid = True, title = 'Reviews in tourist groups by gender', figsize=(8, 8))
ax.set_xlabel('Gender');
ax.set_ylabel('Number of shares');

### 3.2.2 Messages sentiments