In [1]:
# !pip install scikit-learn
# !pip install plotly
# !pip install censusdata

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.linear_model import LinearRegression
import numpy as np

In [3]:
# Colab-only step: mount Google Drive at /content/drive so the CSV files read
# below (under '/content/drive/My Drive/...') are reachable by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Load the two cleaned per-state tables from the mounted Drive folder.
# NOTE(review): the saved output of this cell is a FileNotFoundError for the
# fertility CSV -- confirm both files exist under this folder before running.
fertility_csv = '/content/drive/My Drive/STATE_OF_THE_STATES/clean_fertilityrates.csv'
income_csv = '/content/drive/My Drive/STATE_OF_THE_STATES/clean_incomerate.csv'

df_fertility = pd.read_csv(fertility_csv)
df_income = pd.read_csv(income_csv)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/STATE_OF_THE_STATES/clean_fertilityrates.csv'

In [None]:
# df_income.drop(df_income.columns[12], axis=1, inplace=True)
# df_income.head()

In [None]:
# Display the column labels of the income table (a bare last expression is
# rendered by the notebook).
df_income.columns


In [None]:

# Build the list of year labels shared by both tables: '2010'..'2019' plus
# '2021' ('2020' is deliberately not in the list).
years_common = [str(year) for year in range(2010, 2020)]
years_common.append('2021')

# Keep only the state name and the shared year columns from each table.
shared_columns = ['State'] + years_common
df_income_aligned = df_income[shared_columns]

# Prefix the fertility columns with 'FertilityRate_' so they cannot collide
# with the income columns, which keep their bare year labels after the merge.
fertility_labels = {year: f'FertilityRate_{year}' for year in years_common}
df_fertility_aligned = df_fertility[shared_columns].rename(columns=fertility_labels)

# Default (inner) join on 'State': only states present in both tables are kept.
merged_df = df_income_aligned.merge(df_fertility_aligned, on='State')


In [None]:
# Summary statistics of the merged frame as returned by DataFrame.describe();
# the bare name on the last line makes the notebook render the table.
descriptive_stats = merged_df.describe()
descriptive_stats

In [None]:
# Print the merged frame's column names, dtypes and non-null counts.
merged_df.info()

In [None]:
# Income by state, one line per year in `years_common`.
# Uses the explicit Figure/Axes interface; matplotlib is already imported in
# the notebook's import cell, so it is not re-imported here.
# NOTE(review): this cell indexes merged_df by bare year label ('2010', ...),
# so it must run before the cell that renames those columns to 'income_<year>'.
fig, ax = plt.subplots(figsize=(15, 6))

for year in years_common:
    ax.plot(merged_df['State'], merged_df[year], label=f'Income {year}')

ax.set_xlabel('State')
ax.set_ylabel('Income')
ax.set_title('Income Trend Over Years')
ax.legend()
ax.tick_params(axis='x', labelrotation=90)  # vertical state names stay readable
plt.show()




In [None]:
# Fertility rate by state, one line per year in `years_common`.
fig, ax = plt.subplots(figsize=(15, 6))

for year in years_common:
    fertility_column = f'FertilityRate_{year}'
    ax.plot(merged_df['State'], merged_df[fertility_column], label=f'Fertility Rate {year}')

ax.set_xlabel('State')
ax.set_ylabel('Fertility Rate')
ax.set_title('Fertility Rate Trend Over Years')
ax.legend()
ax.tick_params(axis='x', labelrotation=90)  # vertical state names stay readable
plt.show()

In [None]:
# For each shared year, correlate that year's income column with the matching
# fertility-rate column across states (Series.corr default method).
correlation_data = {
    year: merged_df[year].corr(merged_df[f'FertilityRate_{year}'])
    for year in years_common
}

# One row per year, indexed by the year label, ready for a bar chart.
correlation_df = pd.DataFrame(
    list(correlation_data.items()), columns=['Year', 'Correlation']
).set_index('Year')

# Bar chart of the per-year correlation coefficients.
ax = correlation_df.plot(kind='bar')
ax.set_xlabel('Year')
ax.set_ylabel('Correlation')
ax.set_title('Correlation Between Income and Fertility Rate Over Years')
plt.show()

In [None]:
# Pairwise correlations between all income and fertility-rate columns.
# merged_df still carries the string 'State' column, and since pandas 2.0
# DataFrame.corr() no longer drops non-numeric columns on its own (it raises
# instead), so the numeric restriction has to be requested explicitly.
correlation_matrix = merged_df.corr(numeric_only=True)

# Annotated heatmap of the matrix on an explicitly sized Axes.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='coolwarm', ax=ax)

plt.show()

In [None]:

# Simple linear regression of fertility rate on income for a single year.
year_of_interest = '2021'
income_column = year_of_interest
fertility_column = f'FertilityRate_{year_of_interest}'

# scikit-learn expects a 2-D feature matrix, hence the (n, 1) reshape.
X = merged_df[income_column].values.reshape(-1, 1)  # independent variable: income
y = merged_df[fertility_column].values  # dependent variable: fertility rate
states = merged_df['State'].values  # used as point labels in the scatter plot

# Fit the model and compute the fitted values for the observed incomes.
model = LinearRegression()
model.fit(X, y)
predictions = model.predict(X)

# Collect observed and fitted values in one frame for Plotly.
plot_data = pd.DataFrame(
    {
        'Income': X.flatten(),
        'FertilityRate': y,
        'State': states,
        'PredictedFertilityRate': predictions,
    }
)

# Scatter of the observations labelled by state, with the fitted line on top.
fig = px.scatter(
    plot_data,
    x='Income',
    y='FertilityRate',
    text='State',
    title=f'Regression Analysis for Year {year_of_interest}',
    labels={'Income': 'Income', 'FertilityRate': 'Fertility Rate'},
)
fig.add_scatter(
    x=plot_data['Income'],
    y=plot_data['PredictedFertilityRate'],
    mode='lines',
    name='Regression Line',
)
fig.show()

# Report the fitted parameters and the in-sample R-squared.
r_squared = model.score(X, y)
print(f'Coefficient (Slope): {model.coef_[0]}')
print(f'Intercept: {model.intercept_}')
print(f'R-squared: {r_squared}')

In [None]:
# Compare fertility-rate trajectories for a chosen set of states.
# The list drives both the row filter and the chart title, so the two cannot
# drift apart (the earlier comment named different states than the code used).
# plotly.express is already imported in the notebook's import cell.
states_to_compare = ['Utah', 'Louisiana']
filtered_data = merged_df[merged_df['State'].isin(states_to_compare)]

# Reshape wide -> long: one row per (State, year) with the fertility rate.
plot_data = filtered_data.melt(
    id_vars='State',
    value_vars=[f'FertilityRate_{year}' for year in years_common],
    var_name='Year',
    value_name='FertilityRate',
)

# Strip the 'FertilityRate_' prefix so the x-axis shows bare years.
# regex=False makes this a literal replacement on every pandas version.
plot_data['Year'] = plot_data['Year'].str.replace('FertilityRate_', '', regex=False)

# One line per state across the shared years.
fig = px.line(
    plot_data,
    x='Year',
    y='FertilityRate',
    color='State',
    title=f"Fertility Rate Trends for {' and '.join(states_to_compare)}",
    labels={'Year': 'Year', 'FertilityRate': 'Fertility Rate'},
)

fig.show()


In [None]:
# Print column names, dtypes and non-null counts of the current `plot_data` frame.
plot_data.info()

In [None]:
# Change in fertility rate between the first and last year under analysis.
# The endpoints come from `years_common` (currently '2010' and '2021') and are
# reused in the title, so the chart label always matches the data.
# plotly.express is already imported in the notebook's import cell.
first_year = years_common[0]
last_year = years_common[-1]

# Adds the per-state difference (last minus first) as a new column on merged_df;
# the column is therefore also present in anything saved from merged_df later.
merged_df['FertilityRateChange'] = (
    merged_df[f'FertilityRate_{last_year}'] - merged_df[f'FertilityRate_{first_year}']
)

# Order states from the largest decrease to the largest increase.
sorted_df = merged_df[['State', 'FertilityRateChange']].sort_values(by='FertilityRateChange')

# Vertical bar chart; bar colour encodes the size of the change.
fig = px.bar(
    sorted_df,
    x='State',
    y='FertilityRateChange',
    title=f'Change in Fertility Rates from {first_year} to {last_year} by State',
    labels={'State': 'State', 'FertilityRateChange': 'Change in Fertility Rate'},
    color='FertilityRateChange',
    orientation='v',
)

fig.show()


In [None]:
# Give the bare-year income columns an explicit 'income_' prefix.
# Only labels made up entirely of digits are renamed, so 'State' and the
# 'FertilityRate_*' columns are left alone.
# NOTE(review): this renames merged_df in place, so earlier cells that index
# it by bare year label (e.g. merged_df['2021']) will fail if re-run afterwards.
new_column_names = {}
for column in merged_df.columns:
    if column.isdigit():
        new_column_names[column] = f'income_{column}'

merged_df.rename(columns=new_column_names, inplace=True)
merged_df.info()

In [None]:
# Write the merged table to the shared Drive folder, without the index column.
folder_path = '/content/drive/My Drive/STATE_OF_THE_STATES/'
output_csv = folder_path + 'income_fertility.csv'
merged_df.to_csv(output_csv, index=False)

In [None]:
# Missing-value count per column, shown as a Series labelled by column name.
# The previous loop printed bare integers with no way to tell which column each
# belonged to; the vectorised form is the pandas idiom for this check.
merged_df.isna().sum()

In [None]:
# Column means of the numeric columns only. merged_df still contains the string
# 'State' column, and since pandas 2.0 DataFrame.mean() raises on non-numeric
# columns instead of dropping them, so the restriction must be explicit.
merged_df.mean(numeric_only=True)

In [None]:
# Scrape the HTML tables from the red/blue states article and keep the first one.
party_url = "https://www.jagranjosh.com/general-knowledge/red-and-blue-states-in-us-1701677972-1"
party_tables = pd.read_html(party_url)  # list with one DataFrame per table found
party_df = party_tables[0]

In [None]:
# Preview the first rows of the scraped party table.
party_df.head()