In [98]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt

In [99]:
file_path = "data_tropicana_year110_gdp.csv"
df_gdp = pd.read_csv(file_path, delimiter=",")  

print(df_gdp.head())

   year        gdp
0     0  947610.99
1     1  931427.95
2     2  872704.64
3     3  875281.14
4     4  853371.77


In [100]:
file_path = "data_tropicana_year110.csv"
df = pd.read_csv(file_path, delimiter=",")  

df_gdp.head()

Unnamed: 0,year,gdp
0,0,947610.99
1,1,931427.95
2,2,872704.64
3,3,875281.14
4,4,853371.77


In [101]:
avg_income = df.groupby('year', as_index=False)['income'].mean()

df_gdp = df_gdp.merge(avg_income, on='year', how='right')



In [102]:
import numpy as np
import pandas as pd

# Gender Distribution Calculation
gender_distribution = df.groupby(['year', 'sex']).size().unstack(fill_value=0)
gender_distribution['total'] = gender_distribution.sum(axis=1)
gender_distribution['female_percentage'] = gender_distribution['F'] / gender_distribution['total']

# Employment Rate Calculation
non_employed_professions = ['child', 'unemployed', 'homemaker', 'retired']
df['employed'] = ~df['profession'].isin(non_employed_professions)
employment_rate = df.groupby('year')['employed'].mean()  # Convert to percentage





In [103]:
# Gini Coefficient Calculation
def gini_coefficient(income_list):
    """Computes the Gini coefficient of a list of incomes."""
    incomes = np.sort(income_list)  # Sort incomes in ascending order
    n = len(incomes)
    if n == 0:
        return np.nan  # Avoid division by zero
    cumulative_income_sum = np.sum((np.arange(1, n + 1) * incomes))
    total_income_sum = np.sum(incomes)
    
    gini = (2 * cumulative_income_sum) / (n * total_income_sum) - (n + 1) / n
    return gini

gini = df.groupby('year')['income'].apply(gini_coefficient)

gini.head()

year
0    0.604055
1    0.607001
2    0.634642
3    0.613444
4    0.625739
Name: income, dtype: float64

In [104]:

# Merge
df_gdp = df_gdp.merge(employment_rate, on='year', how='left')

df_gdp = df_gdp.merge(gender_distribution, on='year', how='left')

df_gdp = df_gdp.merge(gini, on='year', how='right')

df_gdp.head()


Unnamed: 0,year,gdp,income_x,employed,F,M,X,total,female_percentage,income_y
0,0,947610.99,1895.22198,0.516,234,244,22,500,0.468,0.604055
1,1,931427.95,1840.7667,0.507905,237,247,22,506,0.468379,0.607001
2,2,872704.64,1711.185569,0.498039,239,248,23,510,0.468627,0.634642
3,3,875281.14,1709.533477,0.492188,239,250,23,512,0.466797,0.613444
4,4,853371.77,1663.492729,0.483431,241,249,23,513,0.469786,0.625739


In [105]:
df_gdp.rename(columns={'income': 'gini'}, inplace=True)
df_gdp = df_gdp.drop(columns=['F', 'M','X'])
df_gdp.head()

Unnamed: 0,year,gdp,income_x,employed,total,female_percentage,income_y
0,0,947610.99,1895.22198,0.516,500,0.468,0.604055
1,1,931427.95,1840.7667,0.507905,506,0.468379,0.607001
2,2,872704.64,1711.185569,0.498039,510,0.468627,0.634642
3,3,875281.14,1709.533477,0.492188,512,0.466797,0.613444
4,4,853371.77,1663.492729,0.483431,513,0.469786,0.625739


In [106]:
#  Add column for average age per year
average_age_per_year = df.groupby('year')['age'].mean().reset_index(name='average_age')

# Merge average age per year into the original dataframe
df_gdp = df_gdp.merge(average_age_per_year, on='year', how='left')


In [107]:
df_gdp.to_csv('day_3.csv')

In [108]:

# Get the last 5 years of data
last_5_years = df_gdp['year'].max() - 4
filtered_df = df_gdp[df_gdp['year'] >= last_5_years]


# Get GDP, Gini, employment rate, and average age for the last 5 years
economic_indicators = filtered_df.groupby('year').agg({
    'gdp': 'mean',
    'gini': 'mean',
    'employed': 'mean',
    'average_age': 'mean'
}).reset_index()



print("\nEconomic indicators for the last 5 years:")
print(economic_indicators)


KeyError: "Column(s) ['gini'] do not exist"