# Create CSV file for Data Visualization in Power BI

In [1]:
# import libraries
import pandas as pd
import numpy as np
import os

In [2]:
# Make sure the output folder for the dashboard CSV exists
# (exist_ok=True keeps a re-run from failing when the folder is already there).
os.makedirs(name='df_dashboard', exist_ok=True)

In [3]:
# Load the final analysis dataframe (produced by the second notebook, per the
# original note) from the data_clean folder.
df = pd.read_csv(filepath_or_buffer=r'data_clean/04_df_analysis.csv')

In [4]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so it is called bare: wrapping it in print() appends a stray "None" line.
df.info()
# Last expression of the cell -> rendered as a preview of the first 3 rows.
df.head(3)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 641480 entries, 0 to 641479
Data columns (total 34 columns):
 #   Column                         Non-Null Count   Dtype  
---  ------                         --------------   -----  
 0   User ID                        641480 non-null  int64  
 1   Variable                       641480 non-null  object 
 2   Question                       641480 non-null  object 
 3   Topic                          625240 non-null  object 
 4   Answer Label                   641480 non-null  object 
 5   Answer Value                   641480 non-null  int64  
 6   Age                            641480 non-null  int64  
 7   Highest Education              641480 non-null  object 
 8   Seconds to Answer              634607 non-null  float64
 9   Sex                            641480 non-null  object 
 10  User Social Weight             641480 non-null  float64
 11  Pet Friends                    591552 non-null  object 
 12  Num. of Children (0-14 years) 

Unnamed: 0,User ID,Variable,Question,Topic,Answer Label,Answer Value,Age,Highest Education,Seconds to Answer,Sex,...,Event month,Municipality,City Name,Region,Region Type,Latitude,Longitude,Altitude Profile,Geo Position,Urbanization Level
0,221250719,B11_Q62_A_1,Please find below some words indicating differ...,Active/ Dynamic,A lot,5,20,High school diploma,10.84925,Male,...,February,Bitonto,Bari,Puglia,Ordinary Statute,41.109964,16.694313,Inner Hill,Hinterland,City / Highly populated
1,89508748,B11_Q62_A_1,Please find below some words indicating differ...,Active/ Dynamic,A little yes and a little no,3,21,High school diploma,12.965,Female,...,February,Bocchigliero,Cosenza,Calabria,Ordinary Statute,39.417678,16.756269,Inner Mountain,Hinterland,Rural / Slightly populated
2,98143773,B11_Q62_A_1,Please find below some words indicating differ...,Active/ Dynamic,A little,2,21,High school diploma,9.831083,Female,...,February,Porto San Giorgio,Fermo,Marche,Ordinary Statute,43.184415,13.796386,Coastal Hill,Coastline,Small city or suburb / Intermediately populated


In [5]:
# Keep only the rows whose 'Answer Value' is non-zero
# (the original note treats these as the questions that were actually answered).
df1 = df.loc[df['Answer Value'].ne(0)]

In [6]:
# Rebind df1 to an explicit deep copy before new columns are assigned to it,
# then show its column labels.
df1 = df1.copy(deep=True)
df1.columns

Index(['User ID', 'Variable', 'Question', 'Topic', 'Answer Label',
       'Answer Value', 'Age', 'Highest Education', 'Seconds to Answer', 'Sex',
       'User Social Weight', 'Pet Friends', 'Num. of Children (0-14 years)',
       'Season', 'Time Slot', 'Survey filling date', 'Survey filling hour',
       'Weekday', 'Mood relationship', 'General Mood', 'Year', 'Generation',
       'Generation Description', 'Events', 'Event month', 'Municipality',
       'City Name', 'Region', 'Region Type', 'Latitude', 'Longitude',
       'Altitude Profile', 'Geo Position', 'Urbanization Level'],
      dtype='object')

In [7]:
# Weighted score per row: the respondent's 'User Social Weight' multiplied by
# the 'Answer Value' (answers range from 1 to 5 according to the original note).
df1['Weighted AVG Social Weight'] = df1['User Social Weight'].mul(df1['Answer Value'])

In [8]:
# Display the filtered frame to inspect the newly added
# 'Weighted AVG Social Weight' column (the notebook renders a truncated preview).
df1

Unnamed: 0,User ID,Variable,Question,Topic,Answer Label,Answer Value,Age,Highest Education,Seconds to Answer,Sex,...,Municipality,City Name,Region,Region Type,Latitude,Longitude,Altitude Profile,Geo Position,Urbanization Level,Weighted AVG Social Weight
0,221250719,B11_Q62_A_1,Please find below some words indicating differ...,Active/ Dynamic,A lot,5,20,High school diploma,10.849250,Male,...,Bitonto,Bari,Puglia,Ordinary Statute,41.109964,16.694313,Inner Hill,Hinterland,City / Highly populated,27.291323
1,89508748,B11_Q62_A_1,Please find below some words indicating differ...,Active/ Dynamic,A little yes and a little no,3,21,High school diploma,12.965000,Female,...,Bocchigliero,Cosenza,Calabria,Ordinary Statute,39.417678,16.756269,Inner Mountain,Hinterland,Rural / Slightly populated,16.374794
2,98143773,B11_Q62_A_1,Please find below some words indicating differ...,Active/ Dynamic,A little,2,21,High school diploma,9.831083,Female,...,Porto San Giorgio,Fermo,Marche,Ordinary Statute,43.184415,13.796386,Coastal Hill,Coastline,Small city or suburb / Intermediately populated,10.916529
3,641447689,B11_Q62_A_1,Please find below some words indicating differ...,Active/ Dynamic,A little,2,21,High school diploma,46.290500,Female,...,Gangi,Palermo,Sicilia,Special Statute,37.798051,14.205242,Inner Mountain,Hinterland,Small city or suburb / Intermediately populated,10.916529
4,61843325,B11_Q62_A_1,Please find below some words indicating differ...,Active/ Dynamic,A little,2,21,High school diploma,10.960417,Female,...,Acireale,Catania,Sicilia,Special Statute,37.612047,15.166279,Coastal Hill,Coastline,City / Highly populated,10.916529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
641475,3328900137,B11_Q71,"In general, how satisfied are you about how th...",,Quite satisfied,3,68,Middle school diploma,,Female,...,Alcara li Fusi,Messina,Sicilia,Special Statute,38.213554,14.701078,Inner Mountain,Hinterland,Rural / Slightly populated,3.131875
641476,3656463002,B11_Q71,"In general, how satisfied are you about how th...",,Very satisfied,4,38,High school diploma,6.419917,Female,...,Abbiategrasso,Milano,Lombardia,Ordinary Statute,45.398580,8.917486,Plain,Hinterland,Small city or suburb / Intermediately populated,2.338433
641477,4227132945,B11_Q71,"In general, how satisfied are you about how th...",,Not very satisfied,2,60,Middle school diploma,17.142417,Male,...,Acerra,Napoli,Campania,Ordinary Statute,40.945805,14.371179,Plain,Hinterland,Small city or suburb / Intermediately populated,2.349566
641478,4232442131,B11_Q71,"In general, how satisfied are you about how th...",,Not at all satisfied,1,52,Degree or doctorate,14.098417,Female,...,Affile,Roma,Lazio,Ordinary Statute,41.884333,13.972828,Inner Mountain,Hinterland,Rural / Slightly populated,0.206523


In [9]:
# List the column labels to confirm 'Weighted AVG Social Weight' was appended.
df1.columns

Index(['User ID', 'Variable', 'Question', 'Topic', 'Answer Label',
       'Answer Value', 'Age', 'Highest Education', 'Seconds to Answer', 'Sex',
       'User Social Weight', 'Pet Friends', 'Num. of Children (0-14 years)',
       'Season', 'Time Slot', 'Survey filling date', 'Survey filling hour',
       'Weekday', 'Mood relationship', 'General Mood', 'Year', 'Generation',
       'Generation Description', 'Events', 'Event month', 'Municipality',
       'City Name', 'Region', 'Region Type', 'Latitude', 'Longitude',
       'Altitude Profile', 'Geo Position', 'Urbanization Level',
       'Weighted AVG Social Weight'],
      dtype='object')

In [10]:
# Parse 'Survey filling date' from day/month/year text into datetime values.
df1['Survey filling date'] = pd.to_datetime(
    arg=df1['Survey filling date'],
    format='%d/%m/%Y',
)

In [11]:
# Build the dataset for the dashboard: the mean of 'Weighted AVG Social Weight'
# for every distinct combination of the descriptive columns listed below.
group_keys = [
    'User ID', 'Variable', 'Question', 'Topic', 'Answer Label',
    'Age', 'Highest Education', 'Seconds to Answer', 'Sex',
    'Pet Friends', 'Num. of Children (0-14 years)',
    'Season', 'Time Slot', 'Survey filling date', 'Survey filling hour',
    'Weekday', 'Mood relationship', 'General Mood', 'Year', 'Generation',
    'Generation Description', 'Events', 'Event month', 'Municipality',
    'City Name', 'Region', 'Region Type', 'Latitude', 'Longitude',
    'Altitude Profile', 'Geo Position', 'Urbanization Level',
]
# dropna=False keeps combinations whose key values are missing;
# as_index=False returns the grouping keys as ordinary columns.
df_analisi = (
    df1
    .groupby(group_keys, dropna=False, as_index=False)['Weighted AVG Social Weight']
    .mean()
)

In [12]:
# (rows, columns) of the grouped frame: the grouping keys plus the averaged column.
df_analisi.shape

(349160, 33)

In [13]:
# Write the grouped dataset into the dashboard folder, leaving out the row index.
df_analisi.to_csv(path_or_buf=r'df_dashboard/df_dashboard_analysis.csv', index=False)