In [103]:
answers_file = 'answers_2.csv'
questions_map = 'questions_map.txt'

In [104]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

df = pd.read_csv(answers_file, delimiter=";")
print("Dataframe size: ", df.shape)
# for c in df.columns.values:
#     print("Null values (column {}): ".format(c), pd.isna(df[c]).sum())
df = df.dropna()
print("Purged dataframe size: ", df.shape)

Dataframe size:  (243, 37)
Purged dataframe size:  (155, 37)


In [105]:
# Columns remapping

mapping = open(questions_map, 'r')
questions = mapping.read().split(";")
mapping.close()
mapping = {before: after for before, after in zip(df.columns.values, questions)}
df = df.rename(columns=mapping)
df_original_copy = df.copy()

In [106]:
for c in df.columns.values:
    to_drop = []
    if df[c].value_counts().shape[0] == 1:
        print("Dropping {} because it's constant".format(c))
        to_drop.append(c)
    df = df.drop(columns=to_drop)

Dropping demo-num-have_ig because it's constant
Dropping bool-does_follow_infl because it's constant


In [107]:
# Re-map the categorical answers

from sklearn import preprocessing
les = {}

for c in [c for c in df.columns.values if 'cat-' in c]:
    le = preprocessing.LabelEncoder()
    le.fit(df[c].values)
    df[c] = le.transform(df[c].values)
    les[c] = le

In [108]:
# Features that are numerical, but needs to be handled
for c in [c for c in df.columns.values if 'num-' in c]:
    cc = df[c].astype(str).str.extract('([0-9]{1})', expand=False).str.strip().astype(int)
    df[c] = cc

In [109]:
# Custom mapping

import re

def map_followers(s):
    s = s.replace("1 Milione", "1000000").replace(".", "")
    p = re.compile("([0-9]+)")
    rng = [int(x) for x in p.findall(s)]
    return int(sum(rng)/len(rng))

def map_age(s):
    if '14' in s:
        return 0
    if '19' in s:
        return 1
    if '30' in s:
        return 2
    print(s)
    return -1

def map_ig_since(s):
    if 'Meno di 6' in s:
        return 0
    if 'Tra 6' in s:
        return 1
    if 'Tra 1' in s:
        return 2
    if 'Da più di 3' in s:
        return 3
    print(s)
    return -1

def map_ig_time(s):
    if 'Meno di' in s:
        return 0
    if 'Tra 1' in s:
        return 1
    if 'Tra 2' in s:
        return 2
    if 'Più di' in s:
        return 3
    print(s)
    return -1

def map_studies(s):
    if (s == "Medie") or (s == "Studente"):
        return 0
    if s == ("Licenza Media") or ("liceo" in s):
        return 1
    if (s == "Diploma") or ("Laurea" in s):
        return 2
    if s == "Laurea Triennale":
        return 3
    if "Laurea Specialistica" in s:
        return 4
    if "Master" in s:
        return 5
    print(s)
    return -1

def map_infl(s):
    if "N" in s:
        return 0
    if "S" in s:
        return 1
    print(s)
    return -1

mapping_functions = {
    'demo-special-studies': map_studies,
    'demo-special-ig_time_daily': map_ig_time,
    'demo-special-ig_since': map_ig_since,
    'demo-special-age': map_age,
    'special-infl_followers': map_followers,
}

for key, function in mapping_functions.items():
    df[key] = df[key].apply(function).astype(int)

In [110]:
# Focus on everything but multiple answers
df_min = df[[c for c in df.columns.values if 'mul-' not in c]]

# Let's define an entropy formula

import numpy as np
from math import log, e

def entropy(labels, base=None):
    """ Computes entropy of label distribution. """
    n_labels = len(labels)
    if n_labels <= 1:
        return 0
    value, counts = np.unique(labels, return_counts=True)
    probs = counts / n_labels
    n_classes = np.count_nonzero(probs)
    if n_classes <= 1:
        return 0
    ent = 0.
    # Compute entropy
    base = e if base is None else base
    for i in probs:
        ent -= i * log(i, base)
    return ent

means = {}
stds = {}
ents = {}
exts = {}

for c in df_min.columns.values:
    cc = df_min[c]
    c_nice = c.split("-")[-1]
    means[c_nice] = cc.mean()
    stds[c_nice] = cc.std()
    ents[c_nice] = entropy(cc.values)
    if 'num-' in c:
        exts[c_nice] = abs((cc.mean() - 3)/2)
        
# Let's adapt features names for logging
df_plot = df_min.rename(columns={c: c.split("-")[-1] for c in df.columns.values}).copy()

In [111]:
df_mul = df[[c for c in df.columns.values if 'mul-' in c]]

def map_s_discovered_infl(s):
    if 'interessi' in s:
        return 'interests'
    if 'amici' in s:
        return 'friends'
    if 'suggerimenti di Instagram' in s:
        return 'explore'
    if 'da altre pagine' in s:
        return 'algo_recommended'
    if 'dalla pagina di brand' in s:
        return 'sponsor'
    if 'sentito parlare o' in s:
        return 'wom'
    return '-1'

def map_s_verticals(s):
    if 'Viaggi' in s:
        return 'travel'
    if 'Fashion' in s:
        return 'fashion'
    if 'Make' in s:
        return 'beauty'
    if 'Tech' in s:
        return 'tech'
    if 'Fitness' in s:
        return 'fitness'
    if 'Food' in s:
        return 'food'
    return '-1'

def map_multiples(s, fun):
    qs = s.split(", ")
    rs = []
    for q in qs:
        rs.append(fun(q))
    return ','.join(rs)

mapping_functions = {
    'mul-how_discovered_infl': map_s_discovered_infl,
    'mul-infl_verticals': map_s_verticals,
}
for key, function in mapping_functions.items():
    df_mul[key] = df_mul[key].apply(lambda x: map_multiples(x, function)).astype(str)

set_keys = {}
for c in df_mul.columns.values:
    set_keys[c] = [x for x in set(','.join(list(df_mul[c].values)).split(","))]
    
for c, values in set_keys.items():
    for new_col in values:
        if new_col == "-1":
            continue
        rs = []
        for row in range(df_mul.shape[0]):
            rs.append((new_col in df_mul.iloc[row, df_mul.columns.get_loc(c)]) *1)
        df_mul[new_col] = rs

df_mul = df_mul.drop(columns=[c for c in df_mul.columns.values if 'mul-' in c])

df_sdt = df_plot.join(df_mul)

In [112]:
df_sdt.head()

Unnamed: 0,age,studies,gender,ig_since,ig_time_daily,follow_friends,share_stuff,follow_infl,follow_brands,follow_vips,...,explore,interests,wom,algo_recommended,tech,fashion,food,fitness,beauty,travel
4,1,2,1,1,1,4,2,2,3,3,...,1,0,0,0,1,0,0,0,0,1
6,1,2,0,1,1,4,2,4,3,3,...,0,1,0,1,0,1,0,0,1,0
7,1,2,0,2,2,3,2,4,2,4,...,0,0,1,0,0,1,0,0,0,1
8,1,2,0,1,1,4,2,3,2,3,...,1,0,0,1,0,1,1,0,0,0
11,1,2,0,2,0,4,2,4,3,2,...,0,0,1,0,0,1,0,0,1,0


In [113]:
df_sdt.describe()

Unnamed: 0,age,studies,gender,ig_since,ig_time_daily,follow_friends,share_stuff,follow_infl,follow_brands,follow_vips,...,explore,interests,wom,algo_recommended,tech,fashion,food,fitness,beauty,travel
count,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,...,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0,155.0
mean,0.819355,1.812903,0.232258,2.483871,1.2,3.529032,2.709677,3.109677,2.741935,3.058065,...,0.309677,0.432258,0.387097,0.483871,0.090323,0.767742,0.2,0.251613,0.406452,0.354839
std,0.433514,0.506927,0.438702,0.758971,0.817027,0.667518,0.789277,0.886901,0.910535,0.846986,...,0.463859,0.496996,0.488665,0.50136,0.287573,0.423642,0.401297,0.435347,0.492763,0.480015
min,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,2.0,0.0,2.0,1.0,3.0,2.0,2.5,2.0,2.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,1.0,2.0,0.0,3.0,1.0,4.0,3.0,3.0,3.0,3.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,1.0,2.0,0.0,3.0,2.0,4.0,3.0,4.0,3.0,4.0,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.5,1.0,1.0
max,2.0,5.0,2.0,3.0,3.0,4.0,4.0,4.0,4.0,4.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [114]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
df_ = df_sdt.copy()
df_[df_.columns] = scaler.fit_transform(df_[df_.columns])
df_.head()

Unnamed: 0,age,studies,gender,ig_since,ig_time_daily,follow_friends,share_stuff,follow_infl,follow_brands,follow_vips,...,explore,interests,wom,algo_recommended,tech,fashion,food,fitness,beauty,travel
4,0.5,0.4,0.5,0.333333,0.333333,1.0,0.333333,0.333333,0.666667,0.666667,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
6,0.5,0.4,0.0,0.333333,0.333333,1.0,0.333333,1.0,0.666667,0.666667,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0
7,0.5,0.4,0.0,0.666667,0.666667,0.666667,0.333333,1.0,0.333333,1.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
8,0.5,0.4,0.0,0.333333,0.333333,1.0,0.333333,0.666667,0.333333,0.666667,...,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
11,0.5,0.4,0.0,0.666667,0.0,1.0,0.333333,1.0,0.666667,0.333333,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [115]:
ents_srt = sorted(ents.items(), key=lambda x: x[1])
exts_srt = sorted(exts.items(), key=lambda x: x[1])[-5:]

In [116]:
for c in df_original_copy.columns.values:
    print(c)

demo-special-age
demo-special-studies
demo-cat-gender
demo-num-have_ig
demo-special-ig_since
demo-special-ig_time_daily
act-num-follow_friends
act-num-share_stuff
act-num-follow_infl
act-num-follow_brands
act-num-follow_vips
act-num-share_brands
act-num-chat
bool-does_follow_infl
mul-how_discovered_infl
mul-infl_verticals
special-infl_followers
because-num-personality
because-num-body
because-num-topics
because-num-many_followers
because-num-everybody_does
because-num-brand_tips
do-num-trust
do-num-follow_advices
do-num-referral
do-num-think_infl_experts
do-num-think_trushworthy
do-num-think_really_tried
num-visit_sponsored_page
num-interact_w_sponsored_brand
num-do_referral_brand
num-cross_information
num-direct_brand_communication
num-did_buy_via_infl
num-did_referred_sponsored_item
num-wannabe_infl
