In [89]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
from scipy import stats
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression 
from sklearn.metrics import r2_score

pd.options.plotting.backend = "plotly"



#suppress pandas SettingWithCopyWarning

pd.options.mode.chained_assignment = None

In [90]:
df1 = pd.read_csv('../econforecasting/data/GDP.csv')
df2 = pd.read_csv('../econforecasting/data/GDPC1.csv')
df3 =  pd.read_csv('../econforecasting/data/JTS3400JOL.csv')
df4 = pd.read_csv('../econforecasting/data/LNS13026638.csv')
df5 = pd.read_csv('../econforecasting/data/LNS14000024.csv')
df6 = pd.read_csv('../econforecasting/data/LNS14032183.csv')
df7 = pd.read_csv('../econforecasting/data/MICH.csv')
df8 = pd.read_csv('../econforecasting/data/NROU.csv')
df9 = pd.read_csv('../econforecasting/data/UNRATE.csv')
df10 = pd.read_csv('../econforecasting/data/USARECD.csv')
df11 = pd.read_csv('../econforecasting/data/USRECD.csv')
df12 = pd.read_csv('../econforecasting/data/CORESTICKM159SFRBATL.csv')
df13 = pd.read_csv('../econforecasting/data/CPIAUCSL.csv')
df14 = pd.read_csv('../econforecasting/data/CUUR0000SA0R.csv')
df15 = pd.read_csv('../econforecasting/data/A939RX0Q048SBEA.csv')


In [91]:
df = df1

# Iterate through the remaining dataframes and merge them
for i in range(2, 16):  # range(2, 16)
    df = pd.merge(df, globals()[f'df{i}'], how='outer', on='DATE')

df = df.rename(columns={
    'GDPC1': 'Real GDP',
    'JTS3400JOL': 'Job Openings in Manufacturing',
    'LNS13026638': 'Permanent Job Losers',
    'LNS14000024': 'Unempl: 20y+',
    'LNS14032183': 'Unempl: Asian',
    'MICH': 'UofM Infl Expectation',
    'NROU': 'Unempl Noncyclical',
    'UNRATE' : 'Unempl Rate',
    'USRECD': 'NBER Recess Indic',
    'USARECD': 'OECD Recess Indic',
    'A939RX0Q048SBEA': 'Real GDP per capita',
    'CORESTICKM159SFRBATL': 'Sticky-Price CPI',
    'CPIAUCSL': 'All Urban Consumer CPI',
    'CUUR0000SA0R': 'Purch. Power of Consumer Dollar in cities',
})


df['DATE'] = pd.to_datetime(df['DATE'])
df.interpolate(method="akima")
#df.fillna(df.mean(), inplace=True)
df.dropna()
df.head()

Unnamed: 0,DATE,GDP,Real GDP,Job Openings in Manufacturing,Permanent Job Losers,Unempl: 20y+,Unempl: Asian,UofM Infl Expectation,Unempl Noncyclical,Unempl Rate,OECD Recess Indic,NBER Recess Indic,Sticky-Price CPI,All Urban Consumer CPI,Purch. Power of Consumer Dollar in cities,Real GDP per capita
0,1913-01-01,,,,,,,,,,,,,,1017.8,
1,1913-02-01,,,,,,,,,,,,,,1021.4,
2,1913-03-01,,,,,,,,,,,,,,1021.4,
3,1913-04-01,,,,,,,,,,,,,,1017.8,
4,1913-05-01,,,,,,,,,,,,,,1025.0,


In [92]:

#df['MONTH'] = df['DATE'].dt.month_name()
#unempl_df = df.groupby('')['GDP'].mean().reset_index()

fig = px.scatter(df, x = "Job Openings in Manufacturing", y = 'Unempl: 20y+')
          
fig.update_traces(marker_color='green')

fig.show()


In [98]:

df_reg = df[['Job Openings in Manufacturing', 'Unempl: 20y+']]
x = df_reg[['Job Openings in Manufacturing']]
y = df_reg[['Unempl: 20y+']]

df_reg.dropna(inplace=True)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 101)

model = LinearRegression()
model.fit(X_train, y_train)

predictions = model.predict(X_test)

r2_score_val = r2_score(y_test, predictions)
print("R^2 Score:", r2_score_val)

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [85]:
df['x_zscore'] = stats.zscore(df['Job Openings in Manufacturing'])
df['y_zscore'] = stats.zscore(df['Unempl: 20y+'])

# Step 2: Identify outliers (Z-score > 3 or < -3)
df['is_outlier'] = (df['x_zscore'].abs() > 1) | (df['y_zscore'].abs() > 1)

# Step 3: Create a new dataset without outliers
df_cleaned = df[~df['is_outlier']]

fig2 = px.scatter(df, x = "Job Openings in Manufacturing", y = 'Unempl: 20y+')
        
fig2.update_traces(marker_color='green')
fig2.show()


In [86]:

df['YEAR'] = df['DATE'].dt.year
month_df = df.groupby('YEAR')['Unempl: 20y+'].mean().reset_index()

fig = go.Figure()


fig.add_trace(go.Scatter(x = month_df["YEAR"], y = month_df['Unempl: 20y+'], mode= "lines"))
fig.add_trace(go.Scatter(x = df['YEAR'], y = df['Unempl Rate'], mode ='lines'))       
fig.update_traces(marker_color='green')
fig.show()

In [87]:
df['YEAR'] = df['DATE'].dt.year
month_df = df.groupby('YEAR')['Unempl Rate'].mean().reset_index()

fig = go.Figure()


fig.add_trace(go.Scatter(x = month_df["YEAR"], y = month_df['Unempl Rate'], mode= "lines"))