# Importazione dati

In [1]:
from google.colab import drive 
import pandas as pd
import re
from glob import glob
import numpy as np
from math import exp

from sklearn import svm
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.metrics import classification_report
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import ConfusionMatrixDisplay

from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTENC

import scipy

In [2]:
drive.mount('/content/gdrive')
suspicious_removed = False # Utilizzata dopo

Mounted at /content/gdrive


Perché il seguente comando funzioni è necessario aggiungere una scorciatoia a DS Lab - PROGETTO al proprio Drive.

In [None]:
#!cp -av '/content/gdrive/My Drive/DS Lab - PROGETTO/Dataset aggregati' 'campaignClickDataset'

Lista dei file contenenti i dati

In [None]:
#part_files = sorted(glob('campaignClickDataset/part*.csv'))

In [None]:
#df = pd.concat((pd.read_csv(file) for file in part_files), ignore_index=True)

In [None]:
# Export in csv
#pd.DataFrame.to_csv(df.set_index('ad_form_id'), '/content/gdrive/My Drive/DS Lab - PROGETTO/Ottobre/dataset_completo.csv')

Importazione dei dati, solo csv completo

In [3]:
df = pd.read_csv('/content/gdrive/My Drive/DS Lab - PROGETTO/Ottobre/dataset_completo.csv')

In [4]:
df.head()

Unnamed: 0,ad_form_id,suspicious,clicks,impressions,buy,os_android,os_bsd,os_ios,os_linux,os_osx,...,feelings1_surprise,feelings1_sympathy,feelings1_thirst,feelings1_thoughtful,feelings1_torment,feelings1_traditionalist,feelings1_trust,feelings1_uncategorized,feelings1_violence,feelings1_wealth
0,7241135337730459930,0,0,1,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8408205703932483258,0,0,1,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5884145627470739741,0,0,2,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2463214613830580928,0,0,6,0,1,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,6157419832685345406,0,0,3,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
print(f'Utenti totali: {len(df)}\nUtenti sospetti: {len(df[df["suspicious"] == 1])}\nUtenti non sospetti: {len(df[df["suspicious"] == 0])}')

Utenti totali: 82564
Utenti sospetti: 152
Utenti non sospetti: 82412


# Colonne
Idee per la rimozione delle colonne:
* soli 0
  * buy
  * sentiments
* identiche informazioni
  * time2, stessa informazione di time1
* correlazione + identiche informazioni
  * 57% degli admants correlate con categories1 -> dopo aver eliminato le categories che causano i nan nel calcolo della correlazione i risultati non sono cambiati
* uguaglianza delle distribuzioni + identiche informazioni
  * categories 1, 2 e 3 => rappresentano le stesse informazioni, ma le c1 sono più diverse a livello di distribuzione tra click e non click rispetto a c2 e c3
  * feelings => la quasi totalità delle colonne (93%) ha la stessa distribuzione tra click e non click (sono di fatto quasi tutti 0) -> la percentuale di 0 nelle colonne è minore del 99% solo in 4 casi: due colonne hanno % di 0 pari a circa il 98%, mentre le altre 2 hanno % di 0 minore del 77%. Queste ultime due colonne hanno anche distribuzione diversa tra click e non click e % di 0 diverse tra click e non click. Una possibile idea è quella di mantenere 3 colonne, queste due più una terza, feelings_others, in cui accorpiamo tutti gli altri feelings. Tuttavia, più di 44000 righe hanno tutti i feelings pari a 0 e questo sembra una mancanza di informazione importante e quindi si può anche valutare di eliminare tutte le colonne feelings.

In [None]:
colonne = list(df.columns)
for colonna in colonne:
  print(colonna)

ad_form_id
suspicious
clicks
impressions
buy
os_android
os_bsd
os_ios
os_linux
os_osx
os_other
os_windows
device_type
browser_android
browser_chrome
browser_chromium
browser_edge
browser_firefox
browser_ie
browser_opera
browser_other
browser_safari
browser_unknown
time1_workday_morning
time1_workday_afternoon
time1_workday_evening
time1_workday_night
time1_weekend_morning
time1_weekend_afternoon
time1_weekend_evening
time1_weekend_night
time2_morning_early
time2_morning
time2_launch
time2_afternoon
time2_evening
time2_night
time2_sleep
L00_50
L51_100
L101_250
L251_500
L501_1000
L1001_2500
L2501_5000
L5001_10000
L10001_more
categories1_artandentertainment
categories1_automotive
categories1_business
categories1_careers
categories1_education
categories1_emotions
categories1_familyandparenting
categories1_finance
categories1_foodanddrink
categories1_healthandfitness
categories1_hobbiesandinterests
categories1_homeandgarden
categories1_intentions
categories1_lawgovtandpolitics
categories1_n

## Buy

Togliamo buy perché è totalmente popolata da 0 e eventuali valori pari a 1 indicherebbero che una persona ha cliccato dopo aver ricevuto la pubblicità => è un'informazione che non abbiamo prima di aver mandato la pubblicità.

In [None]:
df.drop('buy', axis=1, inplace=True)

## Variabili Lunghezza
Per ora teniamo le colonne come sono, senza aggregarle. Poi eventualmente le aggreghiamo sulla base dei risultati di SVM.

### Normalizzazione

Normalizziamo le colonne (esattamente come abbiamo fatto a luglio).

Dall'analisi risulta che si sono circa 20000 righe con valori mancanti (nan o infiniti) per almeno una delle colonne. L'eliminazione la facciamo sulla base della somma (eliminiamo se è nan o infinito).

In [None]:
L = re.compile('^L+.*', re.IGNORECASE)
lung = []

for v in df.columns:
  if L.search(v):
    print(v)
    lung.append(v)

L00_50
L51_100
L101_250
L251_500
L501_1000
L1001_2500
L2501_5000
L5001_10000
L10001_more


In [None]:
# Esempio di ciò che succede coi NaN
df[df['L101_250'].isnull()][lung]

Unnamed: 0,L00_50,L51_100,L101_250,L251_500,L501_1000,L1001_2500,L2501_5000,L5001_10000,L10001_more
0,,,,,,,,,
2,inf,,,,,,,,
3,inf,,,,,,,,
6,inf,,,,,,,,
11,inf,,,,,,,,
...,...,...,...,...,...,...,...,...,...
82529,inf,,,,,,,,
82532,,,,,,,,,
82534,inf,,,,,,,,
82535,inf,,,,,,,,


In [None]:
# Somma dei valori nelle colonne
def somma_lung(riga):
  sum = 0
  for el in lung:
    sum += riga[el]
  return sum

df['somma'] = df.apply(somma_lung, axis=1)
df[['somma']]

Unnamed: 0,somma
0,
1,237.5
2,
3,
4,100.0
...,...
82559,100.0
82560,100.0
82561,100.0
82562,100.0


In [None]:
# Verifichiamo se ci sono valori infiniti
from numpy import inf

sum(df['somma'] == inf) 

0

In [None]:
# Verifichiamo se ci sono valori nulli
sum(df['somma'].isnull())

20033

Eliminiamo ora tutte queste righe.

In [None]:
indexNames = df[(df['somma'].isnull())].index
df.drop(indexNames, inplace=True)
df['somma'].shape

(62531,)

Ora normalizziamo

In [None]:
for l in lung:
  df[l] = round((df[l] / df['somma']) * 100, 2)

df['somma'] = df.apply(somma_lung, axis=1)

In [None]:
# controllo
df[['L00_50', 'L51_100', 'L101_250', 'L251_500', 'L501_1000', 'L1001_2500', 'L2501_5000', 'L5001_10000', 'L10001_more', 'somma']]

Unnamed: 0,L00_50,L51_100,L101_250,L251_500,L501_1000,L1001_2500,L2501_5000,L5001_10000,L10001_more,somma
1,100.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
4,100.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
5,0.00,0.00,0.0,100.0,0.0,0.0,0.0,0.0,0.0,100.0
7,66.67,33.33,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
8,42.86,57.14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
...,...,...,...,...,...,...,...,...,...,...
82559,0.00,100.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
82560,100.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
82561,100.00,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
82562,11.11,88.89,0.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0


In [None]:
# Delete coloumn 'somma'
df.drop(columns=['somma'], inplace=True)

In [None]:
# Controllo se ci sono colonne con valori pari a inf tra tutte le colonne, non solo le lunghezze
for col in df.columns:
  if len(df[df[col] == np.inf]) != 0:
    print(col)

### Distribuzione (?)

Per il momento le lasciamo così come sono. Se dovessimo ottenere dei risultati non buoni, allora vediamo di aggregarle:
* presa una colonna dividiamo la popolazione dei click e dei non click, costruiamo istogrammi e valutiamo se le medie sono statisticamente uguali.

In [None]:
lungh = ('L00_50', 'L51_100', 'L101_250', 'L251_500', 'L501_1000', 'L1001_2500', 'L2501_5000', 'L5001_10000', 'L10001_more')

In [None]:
alfa = 0.01

In [None]:
# per funzionamento guardare quanto scritto per i time1

for t in lungh:
  t1 = df[t]
  t_yes = df[(df['clicks'] > 0)][t]
  t_no = df[(df['clicks'] <= 0)][t]
  
  zinyes=0
  for prob in t_yes:
    if prob <= 0:
      zinyes = zinyes + 1
  
  zinno=0
  for prob in t_no:
    if prob <= 0:
      zinno = zinno + 1
  print(t, ':')
  print('media totale: ', t1.mean())
  print('media dei click: ', round(t_yes.mean(), 3), '          click con probabilità 0: ', zinyes/len(t_yes))
  print('media dei non click: ', round(t_no.mean(), 3), '      non click con probabilità 0: ', zinno/len(t_no))
  p_v = scipy.stats.ttest_ind(t_yes, t_no)[1]
  print('p-value: ', round(p_v, 3))
  if p_v > alfa:
    print('-> medie uguali')
  else:
    print('-> medie diverse')

  print('')

L00_50 :
media totale:  43.82703283171547
media dei click:  37.116           click con probabilità 0:  0.6
media dei non click:  43.85       non click con probabilità 0:  0.5266443093018405
p-value:  0.043
-> medie diverse

L51_100 :
media totale:  31.409021765204457
media dei click:  43.617           click con probabilità 0:  0.5333333333333333
media dei non click:  31.368       non click con probabilità 0:  0.649283548081706
p-value:  0.0
-> medie diverse

L101_250 :
media totale:  13.447622299339528
media dei click:  6.717           click con probabilità 0:  0.9095238095238095
media dei non click:  13.47       non click con probabilità 0:  0.8285168723223312
p-value:  0.003
-> medie diverse

L251_500 :
media totale:  6.97666517407366
media dei click:  8.965           click con probabilità 0:  0.8714285714285714
media dei non click:  6.97       non click con probabilità 0:  0.9092601209865053
p-value:  0.235
-> medie uguali

L501_1000 :
media totale:  1.751839087812445
media dei clic

## Variabili os, browser e device

Tipi:
* os e browser sono composte da più variabili binarie;
* device è categorica, quindi si occuperà `OneHotEncoder` di binarizzarla.

Possiamo accorpare sulla base della loro distribuzione.

### os Distribuzione

In [None]:
os = ('os_android', 'os_bsd', 'os_ios', 'os_linux', 'os_osx', 'os_other', 'os_windows')

In [None]:
for tipo in os:
  yes = df[df['clicks'] > 0][tipo]
  no = df[df['clicks'] <= 0][tipo]

  zinyes = 0
  for r in yes:
    if r <= 0:
      zinyes = zinyes +1

  zinno = 0
  for r in no:
    if r <= 0:
      zinno = zinno +1

  print(tipo, ':')
  print('clicker che usano questo tipo: ', len(yes)-zinyes, 'su ', len(yes), ' -> ', 1-zinyes/len(yes))
  print('non clicker che usano questo tipo: ', len(no)-zinno, 'su ', len(no), ' -> ', 1-zinno/len(no))
  print('')

os_android :
clicker che usano questo tipo:  15 su  210  ->  0.0714285714285714
non clicker che usano questo tipo:  2199 su  62321  ->  0.03528505640153401

os_bsd :
clicker che usano questo tipo:  0 su  210  ->  0.0
non clicker che usano questo tipo:  1 su  62321  ->  1.604595561688349e-05

os_ios :
clicker che usano questo tipo:  0 su  210  ->  0.0
non clicker che usano questo tipo:  0 su  62321  ->  0.0

os_linux :
clicker che usano questo tipo:  2 su  210  ->  0.00952380952380949
non clicker che usano questo tipo:  461 su  62321  ->  0.007397185539384843

os_osx :
clicker che usano questo tipo:  26 su  210  ->  0.12380952380952381
non clicker che usano questo tipo:  9506 su  62321  ->  0.15253285409412554

os_other :
clicker che usano questo tipo:  3 su  210  ->  0.014285714285714235
non clicker che usano questo tipo:  120 su  62321  ->  0.0019255146740264628

os_windows :
clicker che usano questo tipo:  164 su  210  ->  0.780952380952381
non clicker che usano questo tipo:  50034 s

### browser Distribuzione

In [None]:
brow = ('browser_android', 'browser_chrome', 'browser_chromium', 'browser_edge', 'browser_firefox', 'browser_ie', 'browser_opera', 
        'browser_other', 'browser_safari', 'browser_unknown')

In [None]:
for tipo in brow:
  yes = df[df['clicks'] > 0][tipo]
  no = df[df['clicks'] <= 0][tipo]

  zinyes = 0
  for r in yes:
    if r <= 0:
      zinyes = zinyes +1

  zinno = 0
  for r in no:
    if r <= 0:
      zinno = zinno +1

  print(tipo, ':')
  print('clicker che usano questo tipo: ', len(yes)-zinyes, 'su ', len(yes), ' -> ', 1-zinyes/len(yes))
  print('non clicker che usano questo tipo: ', len(no)-zinno, 'su ', len(no), ' -> ', 1-zinno/len(no))
  print('')

browser_android :
clicker che usano questo tipo:  1 su  210  ->  0.004761904761904745
non clicker che usano questo tipo:  77 su  62321  ->  0.0012355385825002507

browser_chrome :
clicker che usano questo tipo:  124 su  210  ->  0.5904761904761905
non clicker che usano questo tipo:  40457 su  62321  ->  0.6491712263923878

browser_chromium :
clicker che usano questo tipo:  0 su  210  ->  0.0
non clicker che usano questo tipo:  0 su  62321  ->  0.0

browser_edge :
clicker che usano questo tipo:  47 su  210  ->  0.2238095238095238
non clicker che usano questo tipo:  9931 su  62321  ->  0.15935238523130246

browser_firefox :
clicker che usano questo tipo:  18 su  210  ->  0.08571428571428574
non clicker che usano questo tipo:  3516 su  62321  ->  0.056417579948973895

browser_ie :
clicker che usano questo tipo:  3 su  210  ->  0.014285714285714235
non clicker che usano questo tipo:  2766 su  62321  ->  0.044383113236308835

browser_opera :
clicker che usano questo tipo:  0 su  210  ->  0.

### device Distribuzione

In [None]:
len(df[df['clicks']>0])

210

In [None]:
for n in (1,2,3):
  users = len(df[df['device_type']==n])
  c_users = len( df[(df['device_type']==n) & (df['clicks']>0)] )
  nc_users = len( df[(df['device_type']==n) & (df['clicks']<=0)] )
  if(n==1): print('Mobile')
  if(n==2): print('Desktop and Laptop')
  if(n==3): print('Unknown/Others')
  print('clicker che usano questo device: ', c_users, 'su ', 210, ' -> ', c_users/210)
  print('non clicker che usano questo device: ', nc_users, 'su ', len(df)-210, ' -> ', nc_users/(len(df)-210))
  print('utilizzatori totali: ', users)
  print(' ')

Mobile
clicker che usano questo device:  15 su  210  ->  0.07142857142857142
non clicker che usano questo device:  2200 su  62321  ->  0.03530110235715088
utilizzatori totali:  2215
 
Desktop and Laptop
clicker che usano questo device:  192 su  210  ->  0.9142857142857143
non clicker che usano questo device:  60003 su  62321  ->  0.9628054748800565
utilizzatori totali:  60195
 
Unknown/Others
clicker che usano questo device:  3 su  210  ->  0.014285714285714285
non clicker che usano questo device:  118 su  62321  ->  0.0018934227627926382
utilizzatori totali:  121
 


## Variabili Time

Contenendo time1 e time2 informazioni simili (momento della giornata in cui la pagina è stata visitata), ma avendo time2 infromazioni mancanti per circa 3000 righe, si è deciso di mantenere le sole colonne di time1.

Viene quindi
* mostrato che le time2 hanno informazioni mancanti (somme a zero);
* mostrare che le time1 hanno tutte le informazioni.

Passi successivi:
* rimuovere effettivamente le Time2;
* normalizzare le Time1.


### Controlli

In [None]:
# colonne da utilizzare
colonne = df.columns.to_numpy()
t = re.compile('^time+.*', re.IGNORECASE)
t1 = re.compile('^time1+.*', re.IGNORECASE)
times1 = []
times2 = []

for v in colonne:
  if t.search(v):
    print(v)
    if t1.search(v):
      times1.append(v)
    else:
      times2.append(v)

time1_workday_morning
time1_workday_afternoon
time1_workday_evening
time1_workday_night
time1_weekend_morning
time1_weekend_afternoon
time1_weekend_evening
time1_weekend_night
time2_morning_early
time2_morning
time2_launch
time2_afternoon
time2_evening
time2_night
time2_sleep


In [None]:
# calcolo la somma dei tempi1 e tempi2

df['sommat1'] = (df['time1_workday_morning'] + df['time1_workday_afternoon'] + df['time1_workday_evening'] + df['time1_workday_night'] +
                 df['time1_weekend_morning'] + df['time1_weekend_afternoon'] + df['time1_weekend_evening'] + df['time1_weekend_night'])

df['sommat2'] = (df['time2_morning_early'] + df['time2_morning'] + df['time2_launch'] + df['time2_afternoon'] + df['time2_evening'] +
                 df['time2_night'] + df['time2_sleep'])

In [None]:
# non sono presenti somme infinite
len(df[(df['sommat1'] > 100000) | (df['sommat2'] > 100000)][['sommat1', 'sommat2']])

0

In [None]:
# sono presenti più di 3000 righe che mostrano somma per tempi2 pari a zero (13 hanno cliccato)
print(len(df[df['sommat2'] == 0][['sommat1', 'sommat2']]))
print(len(df[(df['sommat2'] == 0) & (df['clicks'] > 0)][['sommat1', 'sommat2']]))

3428
13


In [None]:
#  non sono presenti righe con somma a zero per tempi1
df[(df['sommat1'] == 0)][['sommat1', 'sommat2']]

Unnamed: 0,sommat1,sommat2


In [None]:
# non sono presenti valori nulli
print(len(df[df['sommat1'].isnull()]))
print(len(df[df['sommat2'].isnull()]))

0
0


### Normalizzazione time1, eliminazione time2

In [None]:
# Riscalo i tempi(1) in modo che somma sia 100 
for t2 in times1:
  df[t2] = round(df[t2]/ df['sommat1']*100, 2)

times1.append('sommat1')
df[times1].head(5)

Unnamed: 0,time1_workday_morning,time1_workday_afternoon,time1_workday_evening,time1_workday_night,time1_weekend_morning,time1_weekend_afternoon,time1_weekend_evening,time1_weekend_night,sommat1
1,26.32,73.68,0.0,0.0,0.0,0.0,0.0,0.0,237.5
4,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,100.0
5,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
7,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,100.0
8,14.29,14.29,71.43,0.0,0.0,0.0,0.0,0.0,175.0


In [None]:
# elimino colonne di time2 e delle somme
df.drop(columns=['sommat1', 'sommat2'], inplace=True)
df.drop(columns=times2, inplace=True)

### Distribuzioni

Per ogni colonna di time1:
* viene calcolata la media della probabilità 
* viene calcolata la media della probabilità per i soli clickers (clicks>0)
* viene calcolata la media della probabilità per i soli non clickers (clicks=0)
* viene stabilito se la media per i clickers e non clickers coincide (p-value > 0.05)
* viene stabilita la frazione di probabilità pari a zero tra i clickers
* viene stabilita la frazione di probabilità pari a zero tra i non clickers

In [None]:
alfa = 0.05

In [None]:
for t in times1[:-1]:
  t1 = df[t]
  t_yes = df[(df['clicks'] > 0)][t]
  t_no = df[(df['clicks'] <= 0)][t]
  
  zinyes=0
  for prob in t_yes:
    if prob <= 0:
      zinyes = zinyes + 1
  
  zinno=0
  for prob in t_no:
    if prob <= 0:
      zinno = zinno + 1
  print(t, ':')
  print('media totale: ', t1.mean())
  print('media dei click: ', round(t_yes.mean(), 3), '          click con probabilità 0: ', zinyes/len(t_yes))
  print('media dei non click: ', round(t_no.mean(), 3), '      non click con probabilità 0: ', zinno/len(t_no))
  p_v = scipy.stats.ttest_ind(t_yes, t_no)[1]
  print('p-value: ', round(p_v, 3))
  if p_v > alfa:
    print('-> medie uguali')
  else:
    print('-> medie diverse')

  print('')

time1_workday_morning :
media totale:  13.73848603092866
media dei click:  10.168           click con probabilità 0:  0.7428571428571429
media dei non click:  13.751       non click con probabilità 0:  0.7293689125655879
p-value:  0.068
-> medie uguali

time1_workday_afternoon :
media totale:  21.81065199660969
media dei click:  23.195           click con probabilità 0:  0.47619047619047616
media dei non click:  21.806       non click con probabilità 0:  0.5775741724298391
p-value:  0.538
-> medie uguali

time1_workday_evening :
media totale:  15.222247205386127
media dei click:  20.661           click con probabilità 0:  0.5380952380952381
media dei non click:  15.204       non click con probabilità 0:  0.6999245840086006
p-value:  0.007
-> medie diverse

time1_workday_night :
media totale:  8.601074826885865
media dei click:  11.242           click con probabilità 0:  0.7476190476190476
media dei non click:  8.592       non click con probabilità 0:  0.8247460727523628
p-value:  0.097

## Variabili Categories

Normalizziamo le colonne (come abbiamo fatto a luglio, includendo però anche le categories 2 e 3).

### Normalizzazione

#### Categories1

In [None]:
# Colonne categories1
cat1 = re.compile('categories1_+.*', re.IGNORECASE)
cats1 = []
for label in colonne:
  occ = cat1.findall(label)
  if occ:
    cats1.append(occ[0])
  
# Calcolo somma
def somma_cat1(riga):
  sum = 0
  for cat in cats1:
    sum += riga[cat]
  return(round(sum, 1))

df['somma1'] = df.apply(somma_cat1, axis=1)

In [None]:
# controllo quanti nan ci sono nelle somme: non ci sono nan

df[df['somma1'].isnull()][['ad_form_id', 'somma1']]

Unnamed: 0,ad_form_id,somma1


In [None]:
# controllo se ci sono nan nelle categorie singole: non ci sono

for v in cats1:
  nn = len(df[df[v].isnull()])
  if (nn != 0):
    print(v, nn)

In [None]:
# controllo quanti inf ci sono nelle somme: non ci sono inf
df[df['somma1'] == np.inf][['ad_form_id', 'somma1']]

Unnamed: 0,ad_form_id,somma1


Le righe che davano somme infinite nelle lunghezze, sono le stesse che a luglio davano somme infinite nelle categories.

In [None]:
# controllo gli zero: non ce ne sono
df[df['somma1'] == 0][['ad_form_id', 'somma1']]

Unnamed: 0,ad_form_id,somma1


In [None]:
# Normalizzo le somme a 100
for cat in cats1:
  df[cat] = round((df[cat] / df['somma1']) * 100, 2)

In [None]:
df[cats1]

Unnamed: 0,categories1_artandentertainment,categories1_automotive,categories1_business,categories1_careers,categories1_education,categories1_emotions,categories1_familyandparenting,categories1_finance,categories1_foodanddrink,categories1_healthandfitness,...,categories1_realestate,categories1_religionandspirituality,categories1_science,categories1_shopping,categories1_society,categories1_sports,categories1_styleandfashion,categories1_technologyandcomputing,categories1_travel,categories1_uncategorized
1,0.0,0.0,0.0,42.11,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.0,57.89
4,100.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.0,0.00
5,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.76,0.0,11.76,0.0,0.00
7,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,14.71,0.0,51.49,0.0,0.00
8,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,57.39,0.0,42.61
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82559,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,40.00,0.0,0.00
82560,100.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.0,0.00
82561,100.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.0,0.00
82562,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.77,0.0,42.96,0.0,0.00


In [None]:
df.drop('somma1', axis=1, inplace=True)

#### Categories 2

In [None]:
# Colonne categories2
cat2 = re.compile('categories2_+.*', re.IGNORECASE)
cats2 = []
for label in colonne:
  occ = cat2.findall(label)
  if occ:
    cats2.append(occ[0])
  
# Calcolo somma
def somma_cat2(riga):
  sum = 0
  for cat in cats2:
    sum += riga[cat]
  return(round(sum, 1))

df['somma2'] = df.apply(somma_cat2, axis=1)

In [None]:
# controllo quanti nan ci sono nelle somme: non ci sono nan

df[df['somma2'].isnull()][['ad_form_id', 'somma2']]

Unnamed: 0,ad_form_id,somma2


In [None]:
# controllo se ci sono nan nelle categorie singole: non ci sono

for v in cats2:
  nn = len(df[df[v].isnull()])
  if (nn != 0):
    print(v, nn)

In [None]:
# controllo quanti inf ci sono nelle somme: non ci sono inf
df[df['somma2'] == np.inf][['ad_form_id', 'somma2']]

Unnamed: 0,ad_form_id,somma2


Le righe che davano somme infinite nelle lunghezze, sono le stesse che a luglio davano somme infinite nelle categories.

In [None]:
# controllo gli zero: non ce ne sono
df[df['somma2'] == 0][['ad_form_id', 'somma2']]

Unnamed: 0,ad_form_id,somma2


In [None]:
# Normalizzo le somme a 100
for cat in cats2:
  df[cat] = round((df[cat] / df['somma2']) * 100, 2)

In [None]:
df[cats2]

Unnamed: 0,categories2_accessories,categories2_addiction,categories2_adoption,categories2_adulteducation,categories2_adventuretravel,categories2_advertising,categories2_agriculture,categories2_alternativemedicine,categories2_alternativereligions,categories2_americancuisine,...,categories2_waterskiwakeboard,categories2_weather,categories2_weightlifting,categories2_weightloss,categories2_welfare,categories2_windsurfing,categories2_womenshealth,categories2_work,categories2_wrestling,categories2_zoo
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
82560,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
82561,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
82562,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df.drop('somma2', axis=1, inplace=True)

#### Categories 3

In [None]:
# Colonne categories3
cat3 = re.compile('categories3_+.*', re.IGNORECASE)
cats3 = []
for label in colonne:
  occ = cat3.findall(label)
  if occ:
    cats3.append(occ[0])
  
# Calcolo somma
def somma_cat3(riga):
  sum = 0
  for cat in cats3:
    sum += riga[cat]
  return(round(sum, 1))

df['somma3'] = df.apply(somma_cat3, axis=1)

In [None]:
# controllo quanti nan ci sono nelle somme: non ci sono nan

df[df['somma3'].isnull()][['ad_form_id', 'somma3']]

Unnamed: 0,ad_form_id,somma3


In [None]:
# controllo se ci sono nan nelle categorie singole: non ci sono

for v in cats3:
  nn = len(df[df[v].isnull()])
  if (nn != 0):
    print(v, nn)

In [None]:
# controllo quanti inf ci sono nelle somme: non ci sono inf
df[df['somma3'] == np.inf][['ad_form_id', 'somma3']]

Unnamed: 0,ad_form_id,somma3


Le righe che davano somme infinite nelle lunghezze, sono le stesse che a luglio davano somme infinite nelle categories.

In [None]:
# controllo gli zero: non ce ne sono
df[df['somma3'] == 0][['ad_form_id', 'somma3']]

Unnamed: 0,ad_form_id,somma3


In [None]:
# Normalizzo le somme a 100
for cat in cats3:
  df[cat] = round((df[cat] / df['somma3']) * 100, 2)

In [None]:
df[cats3]

Unnamed: 0,categories3_712education,categories3_accessories,categories3_acoustics,categories3_active,categories3_addiction,categories3_adoption,categories3_adulteducation,categories3_adventuretravel,categories3_advertising,categories3_affection,...,categories3_weightloss,categories3_welfare,categories3_windows,categories3_windsurfing,categories3_womenshealth,categories3_woodworking,categories3_work,categories3_wrestling,categories3_zoo,categories3_zoology
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
82560,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
82561,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
82562,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df.drop('somma3', axis=1, inplace=True)

### Correlazione

Calcoliamo correlazione tra tutte le c2 e c1 e vediamo quella più alta e tra quale coppia di categorie si verifica tale valore. Facciamo la stessa cosa con le c3 e le c1 e eventualmente anche con le c3 e le c2.

Processo:
1. costruisco un vettore di dizionari in cui ogni dizionario ha tre chiavi, una per la categories2 considerata, una per la categories1 considerata e una per il coefficiente di correlazione tra le due categories; ho un dizionario per ogni coppia categories2/categories1;
2. trasformo questo vettore in una dataframe;
3. costruisco quindi un nuovo vettore di dizionari in cui ogni dizionario ha tre chiavi, una per la categories2, una per la categories1 che ha il massimo coefficiente di correlazine tra tutte le categories1 e il coefficiente di correlazione stesso;
4. stampo i coefficienti di correlazione significativi.

Tolgo colonne che causano i nan nella correlazione e stampo quali sono.

In [None]:
# Soglia oltre la quale consideriamo significativa una correlazione
soglia_corr = 0.7

#### Categories 1 con Categories 2

In [None]:
# Punto 1
c12 = []
for cat2 in cats2:
    for cat1 in cats1:
        diz = {}
        diz['C2'] = cat2
        diz['C1'] = cat1
        diz['Corr'] = np.corrcoef(df[cat2], df[cat1])[0][1]
        c12.append(diz)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [None]:
# Punto 2
df12 = pd.DataFrame(c12)
df12.head()

Unnamed: 0,C2,C1,Corr
0,categories2_accessories,categories1_artandentertainment,-0.007111
1,categories2_accessories,categories1_automotive,-0.000969
2,categories2_accessories,categories1_business,0.005856
3,categories2_accessories,categories1_careers,-0.00151
4,categories2_accessories,categories1_education,0.005294


In [None]:
# Punto 3
c12 = []
for cat in cats2:
    diz = {}
    diz['C2'] = cat
    diz['C1'] = list(df12[df12['C2'] == cat].sort_values('Corr', ascending=False)['C1'])[0]
    diz['Corr'] = list(df12[df12['C2'] == cat].sort_values('Corr', ascending=False)['Corr'])[0]
    c12.append(diz)

In [None]:
# Punto 4
index = 0 # uso per valutare le correlazioni significative
for diz in c12:
    print(f"{diz['C2']} - {diz['C1']} -> {diz['Corr']}")
    if diz['Corr'] > soglia_corr:
        index += 1
print(f"Totale: {len(c12)}\nCorrelazioni significative: {index}\nCorrelazioni non significative: {len(c12) - index}")

categories2_accessories - categories1_styleandfashion -> 0.13157247713518905
categories2_addiction - categories1_healthandfitness -> 0.08555438500376396
categories2_adoption - categories1_familyandparenting -> 0.00856229607531587
categories2_adulteducation - categories1_travel -> 0.0288544374827147
categories2_adventuretravel - categories1_artandentertainment -> nan
categories2_advertising - categories1_business -> 0.2830075808018205
categories2_agriculture - categories1_business -> 0.09727772074629354
categories2_alternativemedicine - categories1_healthandfitness -> 0.10772546904002196
categories2_alternativereligions - categories1_artandentertainment -> nan
categories2_americancuisine - categories1_artandentertainment -> nan
categories2_animalwelfare - categories1_shopping -> 0.5006225384227991
categories2_antique - categories1_artandentertainment -> nan
categories2_apartments - categories1_realestate -> 0.2958086803308007
categories2_appliances - categories1_homeandgarden -> 0.33137

#### Categories 1 con Categories 3

In [None]:
# Punto 1
c13 = []
for cat3 in cats3:
    for cat1 in cats1:
        diz = {}
        diz['C1'] = cat1
        diz['C3'] = cat3
        diz['Corr'] = np.corrcoef(df[cat1], df[cat3])[0][1]
        c13.append(diz)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [None]:
# Punto 2
df13 = pd.DataFrame(c13)
df13.head()

Unnamed: 0,C1,C3,Corr
0,categories1_artandentertainment,categories3_712education,-0.008719
1,categories1_automotive,categories3_712education,-0.001709
2,categories1_business,categories3_712education,-0.000287
3,categories1_careers,categories3_712education,-0.00281
4,categories1_education,categories3_712education,0.159539


In [None]:
# Punto 3
c13 = []
for cat in cats3:
    diz = {}
    diz['C3'] = cat
    diz['C1'] = list(df13[df13['C3'] == cat].sort_values('Corr', ascending=False)['C1'])[0]
    diz['Corr'] = list(df13[df13['C3'] == cat].sort_values('Corr', ascending=False)['Corr'])[0]
    c13.append(diz)

In [None]:
# Punto 4
index = 0 # uso per valutare le correlazioni significative
for diz in c13:
    print(f"{diz['C3']} - {diz['C1']} -> {diz['Corr']}")
    if diz['Corr'] > soglia_corr:
        index += 1
print(f"Totale: {len(c13)}\nCorrelazioni significative: {index}\nCorrelazioni non significative: {len(c13) - index}")

categories3_712education - categories1_education -> 0.15953938063261586
categories3_accessories - categories1_styleandfashion -> 0.13157247713518905
categories3_acoustics - categories1_artandentertainment -> nan
categories3_active - categories1_artandentertainment -> nan
categories3_addiction - categories1_shopping -> 0.14743753021889075
categories3_adoption - categories1_familyandparenting -> 0.00856229607531587
categories3_adulteducation - categories1_travel -> 0.028854437482714705
categories3_adventuretravel - categories1_artandentertainment -> nan
categories3_advertising - categories1_business -> 0.2830075808018204
categories3_affection - categories1_artandentertainment -> nan
categories3_africa - categories1_artandentertainment -> 0.0054176044214810735
categories3_agriculture - categories1_business -> 0.09625217686343518
categories3_aidshiv - categories1_society -> 0.020417292461199103
categories3_airforce - categories1_society -> 0.12952035267953596
categories3_airfreight - categ

#### Categories 2 con Categories 3

In [None]:
# Punto 1
# Esecuzione lunghissima

c23 = []
for cat2 in cats2:
    for cat3 in cats3:
        diz = {}
        diz['C2'] = cat2
        diz['C3'] = cat3
        diz['Corr'] = np.corrcoef(df[cat2], df[cat3])[0][1]
        c23.append(diz)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [None]:
# Punto 2
df23 = pd.DataFrame(c23)
df23.head()

Unnamed: 0,C2,C3,Corr
0,categories2_accessories,categories3_712education,-0.000211
1,categories2_accessories,categories3_accessories,1.0
2,categories2_accessories,categories3_acoustics,
3,categories2_accessories,categories3_active,
4,categories2_accessories,categories3_addiction,-4.4e-05


In [None]:
# Punto 3
c23 = []
for cat in cats3:
    diz = {}
    diz['C3'] = cat
    diz['C2'] = list(df23[df23['C3'] == cat].sort_values('Corr', ascending=False)['C2'])[0]
    diz['Corr'] = list(df23[df23['C3'] == cat].sort_values('Corr', ascending=False)['Corr'])[0]
    c23.append(diz)

In [None]:
# Punto 4
index = 0 # uso per valutare le correlazioni significative
for diz in c23:
    print(f"{diz['C3']} - {diz['C2']} -> {diz['Corr']}")
    if diz['Corr'] > soglia_corr:
        index += 1
print(f"Totale: {len(c23)}\nCorrelazioni significative: {index}\nCorrelazioni non significative: {len(c23) - index}")

categories3_712education - categories2_graduateschool -> 0.4493159603615543
categories3_accessories - categories2_accessories -> 1.0
categories3_acoustics - categories2_accessories -> nan
categories3_active - categories2_accessories -> nan
categories3_addiction - categories2_addiction -> 0.3904049945499505
categories3_adoption - categories2_adoption -> 1.0
categories3_adulteducation - categories2_adulteducation -> 0.9999999999999998
categories3_adventuretravel - categories2_accessories -> nan
categories3_advertising - categories2_advertising -> 1.0
categories3_affection - categories2_accessories -> nan
categories3_africa - categories2_touristdestinations -> 0.0315748435411293
categories3_agriculture - categories2_agriculture -> 0.9885604504854391
categories3_aidshiv - categories2_disease -> 0.037511015977540835
categories3_airforce - categories2_armedforces -> 0.4389326962036355
categories3_airfreight - categories2_accessories -> nan
categories3_airtravel - categories2_transports -> 0.

### Distribuzione

https://machinelearningmastery.com/statistical-hypothesis-tests-in-python-cheat-sheet/

In [None]:
df_y = df[df['clicks'] != 0]
df_n = df[df['clicks'] == 0]

#### Categories 1

In [None]:
dist_c1 = [] # vettore di dizionari in cui salvo il p-value del test di student
conteggio_distr_identiche_c1 = 0
for cat in cats1:
    diz = {}

    _, p = scipy.stats.mannwhitneyu(df_y[cat], df_n[cat])

    diz['C1'] = cat
    diz['p-value'] = p

    if diz['p-value'] > alfa:
        diz['Esito'] = 'Distribuzione identica'
        conteggio_distr_identiche_c1 += 1
    
    else:
        diz['Esito'] = 'Distribuzione diversa'
    
    dist_c1.append(diz)

#### Categories 2

In [None]:
dist_c2 = [] # vettore di dizionari in cui salvo il p-value del test di student
conteggio_distr_identiche_c2 = 0
for cat in cats2:
    diz = {}

    _, p = scipy.stats.mannwhitneyu(df_y[cat], df_n[cat])

    diz['C2'] = cat
    diz['p-value'] = p

    if diz['p-value'] > alfa:
        diz['Esito'] = 'Distribuzione identica'
        conteggio_distr_identiche_c2 += 1
    
    else:
        diz['Esito'] = 'Distribuzione diversa'
    
    dist_c2.append(diz)

#### Categories 3

In [None]:
dist_c3 = [] # vettore di dizionari in cui salvo il p-value del test di student
conteggio_distr_identiche_c3 = 0
for cat in cats3:
    diz = {}

    _, p = scipy.stats.mannwhitneyu(df_y[cat], df_n[cat])

    diz['C3'] = cat
    diz['p-value'] = p

    if diz['p-value'] > alfa:
        diz['Esito'] = 'Distribuzione identica'
        conteggio_distr_identiche_c3 += 1
    
    else:
        diz['Esito'] = 'Distribuzione diversa'
    
    dist_c3.append(diz)

#### Confronto

In [None]:
print(f"Frazione di Categories 1 con distribuzione identica tra click e non: {round(conteggio_distr_identiche_c1 / len(cats1), 2)}")
print(f"Frazione di Categories 2 con distribuzione identica tra click e non: {round(conteggio_distr_identiche_c2 / len(cats2), 2)}")
print(f"Frazione di Categories 3 con distribuzione identica tra click e non: {round(conteggio_distr_identiche_c3 / len(cats3), 2)}")

Frazione di Categories 1 con distribuzione identica tra click e non: 0.85
Frazione di Categories 2 con distribuzione identica tra click e non: 0.96
Frazione di Categories 3 con distribuzione identica tra click e non: 0.97


#### Analisi della frazione di 0 delle categories 1

##### Click e non click separati

In [None]:
zeri_cnc = []
for cat in cats1:
  diz = {}
  zeri_y = len(df_y[df_y[cat] == 0])
  zeri_n = len(df_n[df_n[cat] == 0])

  diz['Categories1'] = cat
  diz['% 0 tra i click'] = round(zeri_y / len(df_y), 3)
  diz['% 0 tra i non click'] = round(zeri_n / len(df_n), 3)

  zeri_cnc.append(diz)

In [None]:
for diz in zeri_cnc:
  print(diz)

{'Categories1': 'categories1_artandentertainment', '% 0 tra i click': 0.767, '% 0 tra i non click': 0.632}
{'Categories1': 'categories1_automotive', '% 0 tra i click': 0.995, '% 0 tra i non click': 0.99}
{'Categories1': 'categories1_business', '% 0 tra i click': 0.976, '% 0 tra i non click': 0.972}
{'Categories1': 'categories1_careers', '% 0 tra i click': 0.986, '% 0 tra i non click': 0.976}
{'Categories1': 'categories1_education', '% 0 tra i click': 0.995, '% 0 tra i non click': 0.99}
{'Categories1': 'categories1_emotions', '% 0 tra i click': 1.0, '% 0 tra i non click': 1.0}
{'Categories1': 'categories1_familyandparenting', '% 0 tra i click': 0.99, '% 0 tra i non click': 0.99}
{'Categories1': 'categories1_finance', '% 0 tra i click': 0.995, '% 0 tra i non click': 0.986}
{'Categories1': 'categories1_foodanddrink', '% 0 tra i click': 0.981, '% 0 tra i non click': 0.988}
{'Categories1': 'categories1_healthandfitness', '% 0 tra i click': 0.995, '% 0 tra i non click': 0.993}
{'Categories1'

Verifico se ci sono colonne con % di 0 minore di 0.99.

In [None]:
index = 0 # conto quante categories rispettano la condizione
for diz in zeri_cnc:
  if diz['% 0 tra i click'] < 0.99 or diz['% 0 tra i non click'] < 0.99:
    index += 1
    print(diz)

{'Categories1': 'categories1_artandentertainment', '% 0 tra i click': 0.767, '% 0 tra i non click': 0.632}
{'Categories1': 'categories1_business', '% 0 tra i click': 0.976, '% 0 tra i non click': 0.972}
{'Categories1': 'categories1_careers', '% 0 tra i click': 0.986, '% 0 tra i non click': 0.976}
{'Categories1': 'categories1_finance', '% 0 tra i click': 0.995, '% 0 tra i non click': 0.986}
{'Categories1': 'categories1_foodanddrink', '% 0 tra i click': 0.981, '% 0 tra i non click': 0.988}
{'Categories1': 'categories1_hobbiesandinterests', '% 0 tra i click': 0.467, '% 0 tra i non click': 0.691}
{'Categories1': 'categories1_lawgovtandpolitics', '% 0 tra i click': 0.99, '% 0 tra i non click': 0.989}
{'Categories1': 'categories1_news', '% 0 tra i click': 0.986, '% 0 tra i non click': 0.988}
{'Categories1': 'categories1_pets', '% 0 tra i click': 0.986, '% 0 tra i non click': 0.983}
{'Categories1': 'categories1_science', '% 0 tra i click': 0.957, '% 0 tra i non click': 0.909}
{'Categories1': 

In [None]:
print(f"Numero categories1 con % zeri < 0.99 tra i click o tra i non click: {index}")
print(f"Totale categories1 {len(cats1)}")
print(f"Percentuale: {round(index / len(cats1), 3) * 100}%")

Numero categories1 con % zeri < 0.99 tra i click o tra i non click: 16
Totale categories1 26
Percentuale: 61.5%


##### Click e non click uniti

In [None]:
zeri_tot = []
for cat in cats1:
  diz = {}
  zeri = len(df[df[cat] == 0])

  diz['Categories1'] = cat
  diz['% 0'] = round(zeri / len(df), 3)

  zeri_tot.append(diz)

In [None]:
for diz in zeri_tot:
  print(diz)

{'Categories1': 'categories1_artandentertainment', '% 0': 0.632}
{'Categories1': 'categories1_automotive', '% 0': 0.99}
{'Categories1': 'categories1_business', '% 0': 0.972}
{'Categories1': 'categories1_careers', '% 0': 0.976}
{'Categories1': 'categories1_education', '% 0': 0.99}
{'Categories1': 'categories1_emotions', '% 0': 1.0}
{'Categories1': 'categories1_familyandparenting', '% 0': 0.99}
{'Categories1': 'categories1_finance', '% 0': 0.986}
{'Categories1': 'categories1_foodanddrink', '% 0': 0.988}
{'Categories1': 'categories1_healthandfitness', '% 0': 0.993}
{'Categories1': 'categories1_hobbiesandinterests', '% 0': 0.69}
{'Categories1': 'categories1_homeandgarden', '% 0': 0.994}
{'Categories1': 'categories1_intentions', '% 0': 0.993}
{'Categories1': 'categories1_lawgovtandpolitics', '% 0': 0.989}
{'Categories1': 'categories1_news', '% 0': 0.988}
{'Categories1': 'categories1_pets', '% 0': 0.983}
{'Categories1': 'categories1_realestate', '% 0': 0.998}
{'Categories1': 'categories1_rel

Verifico se ci sono colonne con % di 0 minore di 0.99.

In [None]:
index = 0 # uso per contare le categories che rispettano la condizione qui sopra
for diz in zeri_tot:
  if diz['% 0'] < 0.99:
    index += 1
    print(diz)

{'Categories1': 'categories1_artandentertainment', '% 0': 0.632}
{'Categories1': 'categories1_business', '% 0': 0.972}
{'Categories1': 'categories1_careers', '% 0': 0.976}
{'Categories1': 'categories1_finance', '% 0': 0.986}
{'Categories1': 'categories1_foodanddrink', '% 0': 0.988}
{'Categories1': 'categories1_hobbiesandinterests', '% 0': 0.69}
{'Categories1': 'categories1_lawgovtandpolitics', '% 0': 0.989}
{'Categories1': 'categories1_news', '% 0': 0.988}
{'Categories1': 'categories1_pets', '% 0': 0.983}
{'Categories1': 'categories1_science', '% 0': 0.909}
{'Categories1': 'categories1_society', '% 0': 0.989}
{'Categories1': 'categories1_sports', '% 0': 0.817}
{'Categories1': 'categories1_styleandfashion', '% 0': 0.988}
{'Categories1': 'categories1_technologyandcomputing', '% 0': 0.561}
{'Categories1': 'categories1_travel', '% 0': 0.977}
{'Categories1': 'categories1_uncategorized', '% 0': 0.874}


In [None]:
print(f"Numero categories1 con % zeri < 0.99 tra i click o tra i non click: {index}")
print(f"Totale categories1 {len(cats1)}")
print(f"Percentuale: {round(index / len(cats1), 3) * 100}%")

Numero categories1 con % zeri < 0.99 tra i click o tra i non click: 16
Totale categories1 26
Percentuale: 61.5%


## Admants

### Normalizzazione

In [None]:
adm = re.compile('admants+.*', re.IGNORECASE)
adms = []

for v in df.columns:
  if adm.search(v):
    adms.append(v)
  
# Calcolo somma
def somma_adms(riga):
  sum = 0
  for cat in adms:
    sum += riga[cat]
  return(round(sum, 1))

df['somma'] = df.apply(somma_adms, axis=1)

In [None]:
# controllo quanti nan ci sono nelle somme: non ci sono nan

df[df['somma'].isnull()][['ad_form_id', 'somma']]

Unnamed: 0,ad_form_id,somma


In [None]:
# controllo se ci sono nan nelle categorie singole: non ci sono

for v in adms:
  nn = len(df[df[v].isnull()])
  if (nn != 0):
    print(v, nn)

In [None]:
# controllo quanti inf ci sono nelle somme: non ci sono inf
df[df['somma'] == np.inf][['ad_form_id', 'somma']]

Unnamed: 0,ad_form_id,somma


In [None]:
# controllo gli zero: ce ne sono
df[df['somma'] == 0][['ad_form_id', 'somma']]

Unnamed: 0,ad_form_id,somma
9,1520530489999888701,0.0
130,423758730060024042,0.0
176,2949209777750458222,0.0
236,8591797067637818061,0.0
248,8220246112129877727,0.0
...,...,...
82250,806966730273731052,0.0
82254,7207700329399635762,0.0
82413,5081029587588027566,0.0
82431,5407638255918714062,0.0


Guardiamo come sono i click e se ce ne sono di superiori a zero.

In [None]:
df[(df['somma'] == 0) & (df['clicks'] != 0)][['ad_form_id', 'clicks', 'somma']]

Unnamed: 0,ad_form_id,clicks,somma
10459,4441976255661567592,1,0.0
16992,1269509116987569385,1,0.0
22712,824547121173218951,1,0.0
28247,4103523210449571236,1,0.0
38850,1119000281687526928,1,0.0
40988,2813289715831986827,1,0.0
47282,828253886724037402,1,0.0
49613,8703457363297317795,1,0.0
57979,4802750125375816531,1,0.0
71105,5019332339658836935,1,0.0


Per evitare di avere normalizzazioni a NaN: poniamo gli zero nelle somme uguali a un valore arbitrario in modo che la divisione non dia NaN.

In [None]:
soglia = 10

def somma_soglia(riga):
  if riga['somma'] == 0:
    return soglia
  else:
    return riga['somma']
  
df['somma'] = df.apply(somma_soglia, axis=1)

In [None]:
# Normalizzo le somme a 100
for cat in adms:
  df[cat] = round((df[cat] / df['somma']) * 100, 2)

In [None]:
df[adms]

Unnamed: 0,admants1_appliances,admants1_artandentertainment,admants1_automotive,admants1_boatinsurance,admants1_businessandindustrialsupplies,admants1_careers,admants1_carinsurance,admants1_consumerelectronics,admants1_dating,admants1_education,...,admants1_smartphones,admants1_software,admants1_sportinggoods,admants1_sportstickets,admants1_streamingservices,admants1_styleandfashion,admants1_telecommunications,admants1_television,admants1_travel,admants1_videoandcomputergames
1,0.0,0.0,0.0,0.0,0.0,50.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.00,0.00,0.00,0.0,0.00,0.0,0.0,0.00
4,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.00,0.00,0.00,0.0,0.00,0.0,0.0,0.00
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,6.86,6.86,0.00,0.0,0.00,0.0,0.0,44.12
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,9.78,9.78,18.90,0.0,18.90,0.0,0.0,20.20
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.00,0.00,50.00,0.0,50.00,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.00,0.00,14.71,0.0,14.71,0.0,0.0,39.22
82560,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.00,0.00,0.00,0.0,0.00,0.0,0.0,0.00
82561,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,0.00,0.00,0.00,0.0,0.00,0.0,0.0,0.00
82562,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,...,0.0,0.0,1.63,1.63,16.22,0.0,16.22,0.0,0.0,34.82


In [None]:
df.drop('somma', axis=1, inplace=True)

### Correlazione

Guardiamo ora la correlazione con le categories 1 in modo da decretare la fine delle une o delle altre (prof. diceva che le admants sono ottenute in modo automatizzato).

Utilizzo lo stesso procedimento usato per valutare la correlazione tra le categories.

In [None]:
# Punto 1
c1a = []
for adm in adms:
  for cat1 in cats1:
    if len(df[df[cat1] != 0]) > 0: # a causare i nan sono le categories con soli 0
      diz = {}
      diz['Adm'] = adm
      diz['C1'] = cat1
      diz['Corr'] = np.corrcoef(df[adm], df[cat1])[0][1]
      c1a.append(diz)

In [None]:
# Punto 2
df1a = pd.DataFrame(c1a)
df1a.head()

Unnamed: 0,Adm,C1,Corr
0,admants1_appliances,categories1_artandentertainment,-0.013779
1,admants1_appliances,categories1_automotive,0.001934
2,admants1_appliances,categories1_business,0.007368
3,admants1_appliances,categories1_careers,-0.000595
4,admants1_appliances,categories1_education,0.000739


In [None]:
df1a[df1a['Corr'].isnull()]

Unnamed: 0,Adm,C1,Corr


In [None]:
# Punto 3
c1a = []
for adm in adms:
    diz = {}
    diz['Adm'] = adm
    diz['C1'] = list(df1a[df1a['Adm'] == adm].sort_values('Corr', ascending=False)['C1'])[0]
    diz['Corr'] = list(df1a[df1a['Adm'] == adm].sort_values('Corr', ascending=False)['Corr'])[0]
    c1a.append(diz)

In [None]:
# Punto 4
index = 0 # uso per valutare le correlazioni significative
for diz in c1a:
    print(f"{diz['Adm']} - {diz['C1']} -> {diz['Corr']}")
    if diz['Corr'] > soglia_corr:
        index += 1
print(f"Totale: {len(c1a)}\nCorrelazioni significative: {index}\nCorrelazioni non significative: {len(c1a) - index}")

admants1_appliances - categories1_homeandgarden -> 0.3089703219950362
admants1_artandentertainment - categories1_artandentertainment -> 0.9601054396165085
admants1_automotive - categories1_automotive -> 0.9674982108185719
admants1_boatinsurance - categories1_travel -> 0.0935603383022347
admants1_businessandindustrialsupplies - categories1_business -> 0.9243813891677293
admants1_careers - categories1_careers -> 0.9581383405049315
admants1_carinsurance - categories1_automotive -> 0.9778615792431734
admants1_consumerelectronics - categories1_technologyandcomputing -> 0.4218089376742215
admants1_dating - categories1_uncategorized -> 0.25297187908516106
admants1_education - categories1_education -> 0.6997827928727245
admants1_energy - categories1_business -> 0.2676131023893167
admants1_familyandparenting - categories1_familyandparenting -> 0.8862625653217836
admants1_finance - categories1_finance -> 0.9837007627859321
admants1_foodanddrink - categories1_foodanddrink -> 0.9689048237610256
ad

La maggior parte degli admants ha una correlazione superiore a 0.7 con le categories1.

## Sentiments

In [None]:
sents = ['sentiments1_neutroneutralsentiment',
         'sentiments1_sentimentnegativesentiment',
         'sentiments1_sentimentneutralsentiment',
         'sentiments1_sentimentpositivesentiment']

In [None]:
len(df[df[sents[0]] != 0][sents])

0

In [None]:
len(df[df[sents[1]] != 0][sents])

0

In [None]:
len(df[df[sents[2]] != 0][sents])

0

In [None]:
len(df[df[sents[3]] != 0][sents])

0

Le colonne sono di soli 0. Le elimineremo.

## Feelings

### Normalizzazione

In [None]:
fel = re.compile('feelings+.*', re.IGNORECASE)
feel = []

for v in df.columns:
  if fel.search(v):
    feel.append(v)
  
# Calcolo somma
def somma_fel(riga):
  sum = 0
  for cat in feel:
    sum += riga[cat]
  return(round(sum, 1))

df['somma'] = df.apply(somma_fel, axis=1)

In [None]:
# controllo quanti nan ci sono nelle somme: non ci sono nan

df[df['somma'].isnull()][['ad_form_id', 'somma']]

Unnamed: 0,ad_form_id,somma


In [None]:
# controllo se ci sono nan nelle categorie singole: non ci sono

for v in feel:
  nn = len(df[df[v].isnull()])
  if (nn != 0):
    print(v, nn)

In [None]:
# controllo quanti inf ci sono nelle somme: non ci sono inf
df[df['somma'] == np.inf][['ad_form_id', 'somma']]

Unnamed: 0,ad_form_id,somma


In [None]:
# controllo gli zero: ce ne sono
df[df['somma'] == 0][['ad_form_id', 'somma']]

Unnamed: 0,ad_form_id,somma
1,8408205703932483258,0.0
4,6157419832685345406,0.0
7,6272249560265519045,0.0
8,2248420359615433360,0.0
9,1520530489999888701,0.0
...,...,...
82557,2460917516619769780,0.0
82558,2745741610031721805,0.0
82560,6149323461207714452,0.0
82561,7551278096513860694,0.0


Guardiamo come sono i click e se ce ne sono di superiori a zero.

In [None]:
df[(df['somma'] == 0) & (df['clicks'] != 0)][['ad_form_id', 'clicks', 'somma']]

Unnamed: 0,ad_form_id,clicks,somma


Per evitare di avere normalizzazioni a NaN: poniamo gli zero nelle somme uguali a un valore arbitrario in modo che la divisione non dia NaN.

In [None]:
soglia = 10

def somma_soglia(riga):
  if riga['somma'] == 0:
    return soglia
  else:
    return riga['somma']
  
df['somma'] = df.apply(somma_soglia, axis=1)

In [None]:
# Normalizzo le somme a 100
for cat in feel:
  df[cat] = round((df[cat] / df['somma']) * 100, 2)

In [None]:
df[feel]

Unnamed: 0,feelings1_active,feelings1_affection,feelings1_amusement,feelings1_anger,feelings1_anguish,feelings1_anxiety,feelings1_behaviour,feelings1_belonging,feelings1_boredom,feelings1_calm,...,feelings1_surprise,feelings1_sympathy,feelings1_thirst,feelings1_thoughtful,feelings1_torment,feelings1_traditionalist,feelings1_trust,feelings1_uncategorized,feelings1_violence,feelings1_wealth
1,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,56.25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82559,0.0,0.0,56.10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
82560,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
82561,0.0,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
82562,0.0,0.0,56.17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
df.drop('somma', axis=1, inplace=True)

### Correlazione

Guardo la correlazione con il metodo descritto sopra.

In [None]:
# Punto 1
c1f = []
for fel in feel:
  for cat1 in cats1:
    if len(df[df[cat1] != 0]) > 0: # rimuovo le categories con soli 0 perché causano alcuni nan
      diz = {}
      diz['Feel'] = fel
      diz['C1'] = cat1
      diz['Corr'] = np.corrcoef(df[fel], df[cat1])[0][1]
      c1f.append(diz)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [None]:
# Punto 2
df1f = pd.DataFrame(c1f)
df1f.head()

Unnamed: 0,Feel,C1,Corr
0,feelings1_active,categories1_artandentertainment,-0.011484
1,feelings1_active,categories1_automotive,0.01463
2,feelings1_active,categories1_business,0.027578
3,feelings1_active,categories1_careers,-0.003292
4,feelings1_active,categories1_education,0.001013


In [None]:
# Punto 3
c1f = []
for fel in feel:
    diz = {}
    diz['Feel'] = fel
    diz['C1'] = list(df1f[df1f['Feel'] == fel].sort_values('Corr', ascending=False)['C1'])[0]
    diz['Corr'] = list(df1f[df1f['Feel'] == fel].sort_values('Corr', ascending=False)['Corr'])[0]
    c1f.append(diz)

In [None]:
# Punto 4
index = 0 # uso per valutare le correlazioni significative
for diz in c1f:
    print(f"{diz['Feel']} - {diz['C1']} -> {diz['Corr']}")
    if diz['Corr'] > soglia_corr:
        index += 1
print(f"Totale: {len(c1f)}\nCorrelazioni significative: {index}\nCorrelazioni non significative: {len(c1f) - index}")

feelings1_active - categories1_healthandfitness -> 0.03888944053558872
feelings1_affection - categories1_society -> 0.04743394579749098
feelings1_amusement - categories1_hobbiesandinterests -> 0.6282348529593874
feelings1_anger - categories1_sports -> 0.02286649868075084
feelings1_anguish - categories1_artandentertainment -> nan
feelings1_anxiety - categories1_healthandfitness -> 0.06444072242987557
feelings1_behaviour - categories1_foodanddrink -> 0.18293367113937375
feelings1_belonging - categories1_lawgovtandpolitics -> 0.04293282684010049
feelings1_boredom - categories1_hobbiesandinterests -> 0.02381335758085538
feelings1_calm - categories1_homeandgarden -> 0.05576615132058282
feelings1_categoriauncategorized - categories1_artandentertainment -> nan
feelings1_citylover - categories1_artandentertainment -> nan
feelings1_committed - categories1_lawgovtandpolitics -> 0.08931021972403169
feelings1_compassionate - categories1_lawgovtandpolitics -> 0.01486747802742202
feelings1_conformis

Conclusione: i feelings non sono per niente correlati con le categories.

Guardo quanti feelings hanno solo valori pari a 0.

In [None]:
feel_0 = 0
for fel in feel:
  if len(df[df[fel] != 0]) == 0:
    feel_0 += 1

print(f"Feelings tutti nulli: {feel_0} / {len(feel)} ({round(feel_0 / len(feel),2)}%)")

Feelings tutti nulli: 19 / 109 (0.17%)


### Distribuzione

In [None]:
dist_f = [] # vettore di dizionari in cui salvo il p-value del test di student
conteggio_distr_identiche_f = 0
for cat in feel:
    diz = {}

    _, p = scipy.stats.mannwhitneyu(df_y[cat], df_n[cat])

    diz['Feelings'] = cat
    diz['p-value'] = p

    if diz['p-value'] > alfa:
        diz['Esito'] = 'Distribuzione identica'
        conteggio_distr_identiche_f+= 1
    
    else:
        diz['Esito'] = 'Distribuzione diversa'
    
    dist_f.append(diz)

In [None]:
print(f"Frazione di Feelings con distribuzione identica tra click e non click: {round(conteggio_distr_identiche_f / len(feel), 2)}.")

Frazione di Feelings con distribuzione identica tra click e non click: 0.95.


#### Analisi della frazione di 0

###### Click e non click separati

In [None]:
zeri_cnc = []
for fel in feel:
  diz = {}
  zeri_y = len(df_y[df_y[fel] == 0])
  zeri_n = len(df_n[df_n[fel] == 0])

  diz['Feeling'] = fel
  diz['% 0 tra i click'] = round(zeri_y / len(df_y), 3)
  diz['% 0 tra i non click'] = round(zeri_n / len(df_n), 3)

  zeri_cnc.append(diz)

In [None]:
for diz in zeri_cnc:
  print(diz)

{'Feeling': 'feelings1_active', '% 0 tra i click': 1.0, '% 0 tra i non click': 0.998}
{'Feeling': 'feelings1_affection', '% 0 tra i click': 1.0, '% 0 tra i non click': 1.0}
{'Feeling': 'feelings1_amusement', '% 0 tra i click': 0.595, '% 0 tra i non click': 0.771}
{'Feeling': 'feelings1_anger', '% 0 tra i click': 1.0, '% 0 tra i non click': 1.0}
{'Feeling': 'feelings1_anguish', '% 0 tra i click': 1.0, '% 0 tra i non click': 1.0}
{'Feeling': 'feelings1_anxiety', '% 0 tra i click': 1.0, '% 0 tra i non click': 0.999}
{'Feeling': 'feelings1_behaviour', '% 0 tra i click': 0.986, '% 0 tra i non click': 0.98}
{'Feeling': 'feelings1_belonging', '% 0 tra i click': 1.0, '% 0 tra i non click': 1.0}
{'Feeling': 'feelings1_boredom', '% 0 tra i click': 1.0, '% 0 tra i non click': 0.999}
{'Feeling': 'feelings1_calm', '% 0 tra i click': 1.0, '% 0 tra i non click': 1.0}
{'Feeling': 'feelings1_categoriauncategorized', '% 0 tra i click': 1.0, '% 0 tra i non click': 1.0}
{'Feeling': 'feelings1_citylover', 

Verifico se ci sono colonne con % di 0 minore di 0.99.

In [None]:
for diz in zeri_cnc:
  if diz['% 0 tra i click'] < 0.99 or diz['% 0 tra i non click'] < 0.99:
    print(diz)

{'Feeling': 'feelings1_amusement', '% 0 tra i click': 0.595, '% 0 tra i non click': 0.771}
{'Feeling': 'feelings1_behaviour', '% 0 tra i click': 0.986, '% 0 tra i non click': 0.98}
{'Feeling': 'feelings1_emotions', '% 0 tra i click': 0.562, '% 0 tra i non click': 0.717}
{'Feeling': 'feelings1_motivations', '% 0 tra i click': 0.971, '% 0 tra i non click': 0.979}


##### Click e non click uniti

In [None]:
zeri_tot = []
for fel in feel:
  diz = {}
  zeri = len(df[df[fel] == 0])

  diz['Feeling'] = fel
  diz['% 0'] = round(zeri / len(df), 3)

  zeri_tot.append(diz)

In [None]:
for diz in zeri_tot:
  print(diz)

{'Feeling': 'feelings1_active', '% 0': 0.998}
{'Feeling': 'feelings1_affection', '% 0': 1.0}
{'Feeling': 'feelings1_amusement', '% 0': 0.77}
{'Feeling': 'feelings1_anger', '% 0': 1.0}
{'Feeling': 'feelings1_anguish', '% 0': 1.0}
{'Feeling': 'feelings1_anxiety', '% 0': 0.999}
{'Feeling': 'feelings1_behaviour', '% 0': 0.98}
{'Feeling': 'feelings1_belonging', '% 0': 1.0}
{'Feeling': 'feelings1_boredom', '% 0': 0.999}
{'Feeling': 'feelings1_calm', '% 0': 1.0}
{'Feeling': 'feelings1_categoriauncategorized', '% 0': 1.0}
{'Feeling': 'feelings1_citylover', '% 0': 1.0}
{'Feeling': 'feelings1_committed', '% 0': 1.0}
{'Feeling': 'feelings1_compassionate', '% 0': 1.0}
{'Feeling': 'feelings1_conformist', '% 0': 1.0}
{'Feeling': 'feelings1_confusion', '% 0': 0.999}
{'Feeling': 'feelings1_courage', '% 0': 0.998}
{'Feeling': 'feelings1_curiosity', '% 0': 0.997}
{'Feeling': 'feelings1_desire', '% 0': 0.997}
{'Feeling': 'feelings1_disappointment', '% 0': 1.0}
{'Feeling': 'feelings1_disgust', '% 0': 1.0}

Verifico se ci sono colonne con % di 0 minore di 0.99.

In [None]:
for diz in zeri_tot:
  if diz['% 0'] < 0.99:
    print(diz)

{'Feeling': 'feelings1_amusement', '% 0': 0.77}
{'Feeling': 'feelings1_behaviour', '% 0': 0.98}
{'Feeling': 'feelings1_emotions', '% 0': 0.716}
{'Feeling': 'feelings1_motivations', '% 0': 0.979}


##### Distribuzione click e non click nelle 4 colonne notevoli

In [None]:
for diz in dist_f:
  if diz['Feelings'] in ['feelings1_amusement','feelings1_behaviour','feelings1_emotions','feelings1_motivations']:
    print(diz)

{'Feelings': 'feelings1_amusement', 'p-value': 1.0023535584054976e-09, 'Esito': 'Distribuzione diversa'}
{'Feelings': 'feelings1_behaviour', 'p-value': 0.5592511599763439, 'Esito': 'Distribuzione identica'}
{'Feelings': 'feelings1_emotions', 'p-value': 7.227752234401448e-08, 'Esito': 'Distribuzione diversa'}
{'Feelings': 'feelings1_motivations', 'p-value': 0.4752636801336805, 'Esito': 'Distribuzione identica'}


##### feelings_others

In [None]:
def feeling_others(riga):
  sum = 0
  for fel in feel:
    if fel not in ['feelings1_amusement','feelings1_emotions']:
      sum += riga[fel]
  
  return sum

df['feelings_others'] = df.apply(feeling_others, axis=1)

In [None]:
df[['feelings_others','feelings1_amusement','feelings1_emotions']]

Unnamed: 0,feelings_others,feelings1_amusement,feelings1_emotions
1,0.0,0.00,0.00
4,0.0,0.00,0.00
5,0.0,56.25,43.75
7,0.0,0.00,0.00
8,0.0,0.00,0.00
...,...,...,...
82559,0.0,56.10,43.90
82560,0.0,0.00,0.00
82561,0.0,0.00,0.00
82562,0.0,56.17,43.96


In [None]:
df[(df['feelings_others'] == 0) & (df['feelings1_amusement'] == 0) & (df['feelings1_emotions'] == 0)][['feelings_others','feelings1_amusement','feelings1_emotions']]

Unnamed: 0,feelings_others,feelings1_amusement,feelings1_emotions
1,0.0,0.0,0.0
4,0.0,0.0,0.0
7,0.0,0.0,0.0
8,0.0,0.0,0.0
9,0.0,0.0,0.0
...,...,...,...
82557,0.0,0.0,0.0
82558,0.0,0.0,0.0
82560,0.0,0.0,0.0
82561,0.0,0.0,0.0


Ci sono righe che hanno entrate pari a 0 in tutte e 3 le colonne e questo non dovrebbe essere possibile. Guardo se ci sono entrate con others diverso da 0.

In [None]:
df[df['feelings_others'] > 0][['feelings_others','feelings1_amusement','feelings1_emotions']]

Unnamed: 0,feelings_others,feelings1_amusement,feelings1_emotions
16,58.33,0.00,41.67
26,48.78,29.27,21.95
29,69.43,0.00,30.63
40,69.49,0.00,30.66
42,5.61,50.49,43.90
...,...,...,...
82484,76.92,0.00,23.08
82492,55.56,0.00,44.44
82493,48.78,29.27,21.95
82498,48.84,29.30,21.98


Ci sono entrate con others diverso da 0.

La somma non è proprio 100, ma credo sia per via dei round applicati in fase di normalizzazione. Non faccio ulteriori controlli perché il fatto che ci sono più di 44000 righe con tutti i feelings pari a 0, è una buona ragione per eliminare tutti i feelings.