# Merge additional information to main Survey Dataframe

In this fourth notebook all details generated in notebook n.03 are merged into the main Survey Dataframe

In [1]:
# import libraries
import pandas as pd
import numpy as np

In [2]:
# read survey CSV
survey = pd.read_csv(r'data_clean/03_survey_to_merge.csv')
survey.head(3)

Unnamed: 0,User ID,Variable,Question,Topic,Answer Label,Answer Value,Age,CAP,City,Num. of Citizens,...,Num. of Children (0-14 years),Season,Month,Time Slot,birth_year,Survey filling date,Survey filling hour,Weekday,Mood relationship,General Mood
0,221250719,B11_Q62_A_1,Please find below some words indicating differ...,Active/ Dynamic,A lot,5,20,70032,BA,56929.0,...,0,Winter,February,Evening,2003,14/02/2023,19:06:44,Tuesday,PANAS and Personal Life,Good
1,89508748,B11_Q62_A_1,Please find below some words indicating differ...,Active/ Dynamic,A little yes and a little no,3,21,87060,CS,1281.0,...,0,Winter,February,Evening,2002,27/02/2023,18:36:02,Monday,PANAS and Personal Life,Good
2,98143773,B11_Q62_A_1,Please find below some words indicating differ...,Active/ Dynamic,A little,2,21,63822,FM,16062.0,...,0,Winter,February,Evening,2002,19/02/2023,18:39:59,Sunday,PANAS and Personal Life,Good


In [3]:
# read generation mapping, events and geographical details CSV
generation_mapping = pd.read_csv(r'data_update/generation_mapping.csv')
events = pd.read_csv(r'data_update/events.csv')
geo_details = pd.read_csv(r'data_update/geographical_details.csv')

In [4]:
generation_mapping.columns

Index(['Year', 'Generation', 'Generation Description'], dtype='object')

In [5]:
# merge survey with generation mapping containing cultural generations infos
survey = pd.merge(survey, generation_mapping, how='left', left_on='birth_year', right_on='Year')

In [6]:
print(survey.shape)
survey.head(3)

(641480, 30)


Unnamed: 0,User ID,Variable,Question,Topic,Answer Label,Answer Value,Age,CAP,City,Num. of Citizens,...,Time Slot,birth_year,Survey filling date,Survey filling hour,Weekday,Mood relationship,General Mood,Year,Generation,Generation Description
0,221250719,B11_Q62_A_1,Please find below some words indicating differ...,Active/ Dynamic,A lot,5,20,70032,BA,56929.0,...,Evening,2003,14/02/2023,19:06:44,Tuesday,PANAS and Personal Life,Good,2003,Generation Z 1997 - 2012,Generation Z (or Gen Z for short and colloquia...
1,89508748,B11_Q62_A_1,Please find below some words indicating differ...,Active/ Dynamic,A little yes and a little no,3,21,87060,CS,1281.0,...,Evening,2002,27/02/2023,18:36:02,Monday,PANAS and Personal Life,Good,2002,Generation Z 1997 - 2012,Generation Z (or Gen Z for short and colloquia...
2,98143773,B11_Q62_A_1,Please find below some words indicating differ...,Active/ Dynamic,A little,2,21,63822,FM,16062.0,...,Evening,2002,19/02/2023,18:39:59,Sunday,PANAS and Personal Life,Good,2002,Generation Z 1997 - 2012,Generation Z (or Gen Z for short and colloquia...


In [7]:
events.columns

Index(['Events', 'Event month'], dtype='object')

In [8]:
# merge survey with events infos
survey = pd.merge(survey, events, how='left', left_on='Month', right_on='Event month')
print(survey.shape)
survey.head(3)

(641480, 32)


Unnamed: 0,User ID,Variable,Question,Topic,Answer Label,Answer Value,Age,CAP,City,Num. of Citizens,...,Survey filling date,Survey filling hour,Weekday,Mood relationship,General Mood,Year,Generation,Generation Description,Events,Event month
0,221250719,B11_Q62_A_1,Please find below some words indicating differ...,Active/ Dynamic,A lot,5,20,70032,BA,56929.0,...,14/02/2023,19:06:44,Tuesday,PANAS and Personal Life,Good,2003,Generation Z 1997 - 2012,Generation Z (or Gen Z for short and colloquia...,1 February – Lebanese liquidity crisis: The ce...,February
1,89508748,B11_Q62_A_1,Please find below some words indicating differ...,Active/ Dynamic,A little yes and a little no,3,21,87060,CS,1281.0,...,27/02/2023,18:36:02,Monday,PANAS and Personal Life,Good,2002,Generation Z 1997 - 2012,Generation Z (or Gen Z for short and colloquia...,1 February – Lebanese liquidity crisis: The ce...,February
2,98143773,B11_Q62_A_1,Please find below some words indicating differ...,Active/ Dynamic,A little,2,21,63822,FM,16062.0,...,19/02/2023,18:39:59,Sunday,PANAS and Personal Life,Good,2002,Generation Z 1997 - 2012,Generation Z (or Gen Z for short and colloquia...,1 February – Lebanese liquidity crisis: The ce...,February


In [9]:
# check the column to merge of the main dataframe vs the geographical dataframe
print(survey['CAP'].info())
print(geo_details['City CAP'].info())

<class 'pandas.core.series.Series'>
Int64Index: 641480 entries, 0 to 641479
Series name: CAP
Non-Null Count   Dtype
--------------   -----
641480 non-null  int64
dtypes: int64(1)
memory usage: 9.8 MB
None
<class 'pandas.core.series.Series'>
RangeIndex: 8463 entries, 0 to 8462
Series name: City CAP
Non-Null Count  Dtype
--------------  -----
8463 non-null   int64
dtypes: int64(1)
memory usage: 66.2 KB
None


In [10]:
# change dtype to match the following merge
geo_details['City CAP'] = [int(i) for i in geo_details['City CAP']]

In [11]:
survey.columns


Index(['User ID', 'Variable', 'Question', 'Topic', 'Answer Label',
       'Answer Value', 'Age', 'CAP', 'City', 'Num. of Citizens',
       'Highest Education', 'Survey filling date and time',
       'Seconds to Answer', 'Sex', 'User Social Weight', 'idRispondente',
       'Pet Friends', 'Num. of Children (0-14 years)', 'Season', 'Month',
       'Time Slot', 'birth_year', 'Survey filling date', 'Survey filling hour',
       'Weekday', 'Mood relationship', 'General Mood', 'Year', 'Generation',
       'Generation Description', 'Events', 'Event month'],
      dtype='object')

In [12]:
# merge main dataframe with the geographical details dataframe and drop duplicated values generated from the merge
survey = survey.merge(geo_details, how='left', left_on='CAP', right_on='City CAP')
print(survey.shape)

(1711298, 43)


In [13]:
# drop duplicated rows generated from merge
survey.drop_duplicates(subset=['User ID', 'Question', 'Topic', 'Answer Label'], keep='first', inplace=True, ignore_index=True)
print(survey.shape)
survey.isnull().sum()

(641480, 43)


User ID                              0
Variable                             0
Question                             0
Topic                            16240
Answer Label                         0
Answer Value                         0
Age                                  0
CAP                                  0
City                             36814
Num. of Citizens                  4582
Highest Education                    0
Survey filling date and time         0
Seconds to Answer                 6873
Sex                                  0
User Social Weight                   0
idRispondente                    49928
Pet Friends                      49928
Num. of Children (0-14 years)    49928
Season                               0
Month                                0
Time Slot                            0
birth_year                           0
Survey filling date                  0
Survey filling hour                  0
Weekday                              0
Mood relationship        

In [14]:
# filter the remaining null values still resulting from the previous merge and assign it to a new dataframe 'null_position'
null_position = survey[survey['Geo Position'].isnull() == True]
print(null_position.shape)
null_position.head()

(19276, 43)


Unnamed: 0,User ID,Variable,Question,Topic,Answer Label,Answer Value,Age,CAP,City,Num. of Citizens,...,City CAP,sigla_provincia,City Name,Region,Region Type,Latitude,Longitude,Altitude Profile,Geo Position,Urbanization Level
14,1906221911,B11_Q62_A_1,Please find below some words indicating differ...,Active/ Dynamic,A little yes and a little no,3,23,20165,,,...,,,,,,,,,,
49,3003851093,B11_Q62_A_1,Please find below some words indicating differ...,Active/ Dynamic,Quite a lot,4,26,100,RM,2546804.0,...,,,,,,,,,,
50,3984968717,B11_Q62_A_1,Please find below some words indicating differ...,Active/ Dynamic,A little yes and a little no,3,26,100,RM,2546804.0,...,,,,,,,,,,
180,1236676492,B11_Q62_A_1,Please find below some words indicating differ...,Active/ Dynamic,A little yes and a little no,3,31,10102,,,...,,,,,,,,,,
229,3851484828,B11_Q62_A_1,Please find below some words indicating differ...,Active/ Dynamic,A little yes and a little no,3,32,10107,,,...,,,,,,,,,,


In [15]:
pd.Series(null_position.columns)

0                           User ID
1                          Variable
2                          Question
3                             Topic
4                      Answer Label
5                      Answer Value
6                               Age
7                               CAP
8                              City
9                  Num. of Citizens
10                Highest Education
11     Survey filling date and time
12                Seconds to Answer
13                              Sex
14               User Social Weight
15                    idRispondente
16                      Pet Friends
17    Num. of Children (0-14 years)
18                           Season
19                            Month
20                        Time Slot
21                       birth_year
22              Survey filling date
23              Survey filling hour
24                          Weekday
25                Mood relationship
26                     General Mood
27                          

In [16]:
# drop the columns of the new 'null_position' dataframe in order to merge again the geographical details using another matching column
null_position = null_position.copy()
null_position.drop(columns= null_position.iloc[:,32:], inplace= True)
null_position.columns

Index(['User ID', 'Variable', 'Question', 'Topic', 'Answer Label',
       'Answer Value', 'Age', 'CAP', 'City', 'Num. of Citizens',
       'Highest Education', 'Survey filling date and time',
       'Seconds to Answer', 'Sex', 'User Social Weight', 'idRispondente',
       'Pet Friends', 'Num. of Children (0-14 years)', 'Season', 'Month',
       'Time Slot', 'birth_year', 'Survey filling date', 'Survey filling hour',
       'Weekday', 'Mood relationship', 'General Mood', 'Year', 'Generation',
       'Generation Description', 'Events', 'Event month'],
      dtype='object')

In [17]:
# merge null_position the geographical dataframe 
null_position = pd.merge(null_position, geo_details, how='left', left_on='City', right_on='sigla_provincia')
null_position.drop_duplicates(subset=['User ID', 'Question', 'Topic', 'Answer Label'], keep='first', inplace=True, ignore_index=True)

In [18]:
print(null_position.shape)
pd.Series(null_position.columns)

(19276, 43)


0                           User ID
1                          Variable
2                          Question
3                             Topic
4                      Answer Label
5                      Answer Value
6                               Age
7                               CAP
8                              City
9                  Num. of Citizens
10                Highest Education
11     Survey filling date and time
12                Seconds to Answer
13                              Sex
14               User Social Weight
15                    idRispondente
16                      Pet Friends
17    Num. of Children (0-14 years)
18                           Season
19                            Month
20                        Time Slot
21                       birth_year
22              Survey filling date
23              Survey filling hour
24                          Weekday
25                Mood relationship
26                     General Mood
27                          

In [19]:
# create new dataframe 'not_null_position' filtering not null values from the first merge of the main dataframe with the geographical dataframe
not_null_position = survey[survey['Geo Position'].isnull() == False]
print(not_null_position.shape)
pd.Series(not_null_position.columns)

(622204, 43)


0                           User ID
1                          Variable
2                          Question
3                             Topic
4                      Answer Label
5                      Answer Value
6                               Age
7                               CAP
8                              City
9                  Num. of Citizens
10                Highest Education
11     Survey filling date and time
12                Seconds to Answer
13                              Sex
14               User Social Weight
15                    idRispondente
16                      Pet Friends
17    Num. of Children (0-14 years)
18                           Season
19                            Month
20                        Time Slot
21                       birth_year
22              Survey filling date
23              Survey filling hour
24                          Weekday
25                Mood relationship
26                     General Mood
27                          

In [20]:
# concat the dataframes 'null_position' and 'not_null_position'. Null values have decreased. Final dataframe is the result
survey_final = pd.concat([not_null_position, null_position], ignore_index=True)
print(survey_final.shape)
survey_final.isnull().sum()

(641480, 43)


User ID                              0
Variable                             0
Question                             0
Topic                            16240
Answer Label                         0
Answer Value                         0
Age                                  0
CAP                                  0
City                             36814
Num. of Citizens                  4582
Highest Education                    0
Survey filling date and time         0
Seconds to Answer                 6873
Sex                                  0
User Social Weight                   0
idRispondente                    49928
Pet Friends                      49928
Num. of Children (0-14 years)    49928
Season                               0
Month                                0
Time Slot                            0
birth_year                           0
Survey filling date                  0
Survey filling hour                  0
Weekday                              0
Mood relationship        

In [21]:
survey_final.columns

Index(['User ID', 'Variable', 'Question', 'Topic', 'Answer Label',
       'Answer Value', 'Age', 'CAP', 'City', 'Num. of Citizens',
       'Highest Education', 'Survey filling date and time',
       'Seconds to Answer', 'Sex', 'User Social Weight', 'idRispondente',
       'Pet Friends', 'Num. of Children (0-14 years)', 'Season', 'Month',
       'Time Slot', 'birth_year', 'Survey filling date', 'Survey filling hour',
       'Weekday', 'Mood relationship', 'General Mood', 'Year', 'Generation',
       'Generation Description', 'Events', 'Event month', 'Municipality',
       'City CAP', 'sigla_provincia', 'City Name', 'Region', 'Region Type',
       'Latitude', 'Longitude', 'Altitude Profile', 'Geo Position',
       'Urbanization Level'],
      dtype='object')

In [22]:
# drop irrelevant columns from final dataframe
survey_final.drop(columns=['CAP', 'City', 'Num. of Citizens', 'Survey filling date and time', 
                             'idRispondente', 'Month', 'birth_year', 'City CAP', 'sigla_provincia'], inplace=True)

In [23]:
# save final dataframe to csv
survey_final.to_csv(r'data_clean/04_df_analysis.csv', index=False)