# Mini-Project 2a: Reading Data

In [74]:
# python version 3.12.5
import pandas as pd
import altair as alt
from collections import defaultdict
from typing import DefaultDict, Any

In [75]:
# Folder that holds the data files. The reading cells build their paths as
# f"{data_folder_path}<file name>", so the value must end with a "/".
# Disabled interactive alternative, kept for reference:
# data_folder_path = input("Data folder path? (defaults to: ./): ") or "./"
# print(data_folder_path)
data_folder_path = "./data/"

## `happiness_report.csv`

In [76]:
# Column dtypes shared by the typed read_csv calls in this notebook:
# `country` is read as a string, every column not listed falls back to float.
# Passing a defaultdict as `dtype` requires pandas >= 1.5.0, see
# https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html
dtype_dict: DefaultDict[str, Any] = defaultdict(lambda: float)
dtype_dict["country"] = "str"

# Plain comma-separated file: only the dtype mapping is needed.
happiness_report_regular = pd.read_csv(f"{data_folder_path}happiness_report.csv", dtype=dtype_dict)  # type: ignore

# Print the column summary, then show the frame as the cell's rich output.
happiness_report_regular.info()
happiness_report_regular

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   country          155 non-null    object 
 1   happiness_score  155 non-null    float64
 2   GDP_per_capita   155 non-null    float64
 3   life_expectancy  155 non-null    float64
 4   freedom          155 non-null    float64
dtypes: float64(4), object(1)
memory usage: 6.2+ KB


Unnamed: 0,country,happiness_score,GDP_per_capita,life_expectancy,freedom
0,Norway,7.537,1.616463,0.796667,0.635423
1,Denmark,7.522,1.482383,0.792566,0.626007
2,Iceland,7.504,1.480633,0.833552,0.627163
3,Switzerland,7.494,1.564980,0.858131,0.620071
4,Finland,7.469,1.443572,0.809158,0.617951
...,...,...,...,...,...
150,Rwanda,3.471,0.368746,0.326425,0.581844
151,Syria,3.462,0.777153,0.500533,0.081539
152,Tanzania,3.349,0.511136,0.364509,0.390018
153,Burundi,2.905,0.091623,0.151611,0.059901


## `happiness_report_semicolon.csv`

In [77]:
# This file uses ";" between fields, so override the default "," separator.
# No dtype mapping is passed here; pandas infers the column types itself.
hr_semicolon_csv = pd.read_csv(f"{data_folder_path}happiness_report_semicolon.csv", sep=";")

# Print the column summary, then show the frame as the cell's rich output.
hr_semicolon_csv.info()
hr_semicolon_csv

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   country          155 non-null    object
 1   happiness_score  155 non-null    object
 2   GDP_per_capita   155 non-null    object
 3   life_expectancy  155 non-null    object
 4   freedom          155 non-null    object
dtypes: object(5)
memory usage: 6.2+ KB


Unnamed: 0,country,happiness_score,GDP_per_capita,life_expectancy,freedom
0,Norway,753700017929077,161646318435669,0796666502952576,0635422587394714
1,Denmark,752199983596802,148238301277161,0792565524578094,0626006722450256
2,Iceland,750400018692017,1480633020401,0833552122116089,0627162635326385
3,Switzerland,749399995803833,156497955322266,0858131289482117,0620070576667786
4,Finland,74689998626709,144357192516327,080915766954422,0617950856685638
...,...,...,...,...,...
150,Rwanda,347099995613098,0368745893239975,0326424807310104,0581843852996826
151,Syria,346199989318848,0777153134346008,050053334236145,00815394446253777
152,Tanzania,334899997711182,0511135876178741,0364509284496307,0390017777681351
153,Burundi,290499997138977,0091622568666935,0151610791683197,00599007532000542


In [78]:
# Optional clean-up (not required by the assignment): convert every column
# except `country` to float. The .info() output of the previous cell reports
# these columns as dtype `object`, presumably because the file writes decimals
# with "," instead of "." -- confirm against the raw file.
# The columns are reassigned on `hr_semicolon_csv` itself; re-running the cell
# is harmless because the plain astype(float) then succeeds on the first try.
# Alternative: pass `decimal=","` to pd.read_csv so the values are parsed as
# floats at load time and no post-processing is needed.
for col in hr_semicolon_csv.columns:
  if col == 'country':
    continue
  try:
    hr_semicolon_csv[col] = hr_semicolon_csv[col].astype(float)
  except (ValueError, TypeError):
    # Plain conversion failed: retry with "," treated as the decimal separator.
    try:
      hr_semicolon_csv[col] = (
        hr_semicolon_csv[col].str.replace(",", ".", regex=False).astype(float)
      )
    except (AttributeError, ValueError, TypeError) as err:
      # Stay best-effort (the column is left unchanged), but report the
      # failure instead of silently swallowing every possible exception.
      print(f"Could not convert column {col!r} to float: {err}")
hr_semicolon_csv.info()
hr_semicolon_csv

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   country          155 non-null    object 
 1   happiness_score  155 non-null    float64
 2   GDP_per_capita   155 non-null    float64
 3   life_expectancy  155 non-null    float64
 4   freedom          155 non-null    float64
dtypes: float64(4), object(1)
memory usage: 6.2+ KB


Unnamed: 0,country,happiness_score,GDP_per_capita,life_expectancy,freedom
0,Norway,7.537,1.616463,0.796667,0.635423
1,Denmark,7.522,1.482383,0.792566,0.626007
2,Iceland,7.504,1.480633,0.833552,0.627163
3,Switzerland,7.494,1.564980,0.858131,0.620071
4,Finland,7.469,1.443572,0.809158,0.617951
...,...,...,...,...,...
150,Rwanda,3.471,0.368746,0.326425,0.581844
151,Syria,3.462,0.777153,0.500533,0.081539
152,Tanzania,3.349,0.511136,0.364509,0.390018
153,Burundi,2.905,0.091623,0.151611,0.059901


## `happiness_report.tsv`

In [79]:
# Tab-separated variant of the report: same dtype mapping as the plain CSV,
# with the field separator switched to a tab character.
happiness_report_tsv = pd.read_csv(  # type: ignore
  filepath_or_buffer=f"{data_folder_path}happiness_report.tsv",
  sep="\t",
  dtype=dtype_dict,
)

# Print the column summary, then show the frame as the cell's rich output.
happiness_report_tsv.info()
happiness_report_tsv

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   country          155 non-null    object 
 1   happiness_score  155 non-null    float64
 2   GDP_per_capita   155 non-null    float64
 3   life_expectancy  155 non-null    float64
 4   freedom          155 non-null    float64
dtypes: float64(4), object(1)
memory usage: 6.2+ KB


Unnamed: 0,country,happiness_score,GDP_per_capita,life_expectancy,freedom
0,Norway,7.537,1.616463,0.796667,0.635423
1,Denmark,7.522,1.482383,0.792566,0.626007
2,Iceland,7.504,1.480633,0.833552,0.627163
3,Switzerland,7.494,1.564980,0.858131,0.620071
4,Finland,7.469,1.443572,0.809158,0.617951
...,...,...,...,...,...
150,Rwanda,3.471,0.368746,0.326425,0.581844
151,Syria,3.462,0.777153,0.500533,0.081539
152,Tanzania,3.349,0.511136,0.364509,0.390018
153,Burundi,2.905,0.091623,0.151611,0.059901


## `happiness_report_metadata.csv`

In [80]:
# skiprows=3 makes pandas skip the first three lines of the file before it
# looks for the header row -- presumably the metadata preamble the file name
# refers to; confirm against the raw file.
happiness_report_metadata = pd.read_csv(  # type: ignore
  filepath_or_buffer=f"{data_folder_path}happiness_report_metadata.csv",
  skiprows=3,
  dtype=dtype_dict,
)

# Print the column summary, then show the frame as the cell's rich output.
happiness_report_metadata.info()
happiness_report_metadata

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   country          155 non-null    object 
 1   happiness_score  155 non-null    float64
 2   GDP_per_capita   155 non-null    float64
 3   life_expectancy  155 non-null    float64
 4   freedom          155 non-null    float64
dtypes: float64(4), object(1)
memory usage: 6.2+ KB


Unnamed: 0,country,happiness_score,GDP_per_capita,life_expectancy,freedom
0,Norway,7.537,1.616463,0.796667,0.635423
1,Denmark,7.522,1.482383,0.792566,0.626007
2,Iceland,7.504,1.480633,0.833552,0.627163
3,Switzerland,7.494,1.564980,0.858131,0.620071
4,Finland,7.469,1.443572,0.809158,0.617951
...,...,...,...,...,...
150,Rwanda,3.471,0.368746,0.326425,0.581844
151,Syria,3.462,0.777153,0.500533,0.081539
152,Tanzania,3.349,0.511136,0.364509,0.390018
153,Burundi,2.905,0.091623,0.151611,0.059901


## `happiness_report_no_header.csv`

In [81]:
# Column names are supplied explicitly through `names`; with no `header`
# argument given, pandas then treats the first line of the file as data.
happiness_report_noheader = pd.read_csv(  # type: ignore
  filepath_or_buffer=f"{data_folder_path}happiness_report_no_header.csv",
  names=[
    "country",
    "happiness_score",
    "GDP_per_capita",
    "life_expectancy",
    "freedom",
  ],
  dtype=dtype_dict,
)

# Print the column summary, then show the frame as the cell's rich output.
happiness_report_noheader.info()
happiness_report_noheader

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155 entries, 0 to 154
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   country          155 non-null    object 
 1   happiness_score  155 non-null    float64
 2   GDP_per_capita   155 non-null    float64
 3   life_expectancy  155 non-null    float64
 4   freedom          155 non-null    float64
dtypes: float64(4), object(1)
memory usage: 6.2+ KB


Unnamed: 0,country,happiness_score,GDP_per_capita,life_expectancy,freedom
0,Norway,7.537,1.616463,0.796667,0.635423
1,Denmark,7.522,1.482383,0.792566,0.626007
2,Iceland,7.504,1.480633,0.833552,0.627163
3,Switzerland,7.494,1.564980,0.858131,0.620071
4,Finland,7.469,1.443572,0.809158,0.617951
...,...,...,...,...,...
150,Rwanda,3.471,0.368746,0.326425,0.581844
151,Syria,3.462,0.777153,0.500533,0.081539
152,Tanzania,3.349,0.511136,0.364509,0.390018
153,Burundi,2.905,0.091623,0.151611,0.059901


## Visualizing the `happiness_report_noheader` DF using Altair's `mark_point` function

In [82]:
# Scatter plot with one point per row of happiness_report_noheader:
# GDP_per_capita on the x axis, life_expectancy on the y axis.
# configure_axis applies the font sizes to both axes of the chart.
plot = (
  alt.Chart(happiness_report_noheader)
  .mark_point()
  .encode(
    x=alt.X("GDP_per_capita", title="How much GDP contributes to Happiness Score"),
    y=alt.Y("life_expectancy", title="How much Life expectancy contributes to Happiness Score"),
  )
  .configure_axis(labelFontSize=12, titleFontSize=10)
)
plot

## Visualizing `happiness_report_metadata.csv` DF using Altair's `mark_bar` function

In [83]:
# Histogram of happiness_score from happiness_report_metadata: the x channel
# is binned and the y channel counts the records that fall into each bin.
# configure_axis applies the font sizes to both axes of the chart.
plot = (
  alt.Chart(happiness_report_metadata)
  .mark_bar()
  .encode(
    x=alt.X("happiness_score", title="happiness_score (binned)").bin(),
    y=alt.Y("count()", title="Count of Records"),
  )
  .configure_axis(labelFontSize=12, titleFontSize=10)
)
plot