In [None]:
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as py 
import seaborn as sns
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Locate the two raw CSV files relative to the notebook's working directory.
indices_data = Path('Datasets/INDICES_DATA.csv')
covid_data = Path('Datasets/owid-covid-data.csv')

# Read in data.
# `low_memory` is an option of pandas.read_csv, not of pathlib.Path: passing it
# to Path() has no effect on parsing (and newer Python versions reject unknown
# Path keywords), so it is handed to the reader where it actually applies.
indices_data_df = pd.read_csv(indices_data, low_memory=False)
covid_data_df = pd.read_csv(covid_data)

In [None]:
# Preview the first five rows of the raw indices table.
indices_data_df.head(n=5)

In [None]:
# Preview the first five rows of the raw COVID table.
covid_data_df.head(n=5)

In [None]:
# Give the COVID columns used by the rest of the notebook title-cased names.
covid_column_labels = {
    'date': 'Date',
    'location': 'Location',
    'total_cases': 'Total Cases',
    'total_deaths': 'Total Deaths',
    'population': 'Population',
}
covid_data_df = covid_data_df.rename(columns=covid_column_labels)
covid_data_df.head()

In [None]:
# The first data row holds the real column names: keep it aside, drop it from
# the data, and install it as the header.
new_header = indices_data_df.iloc[0]
indices_data_df = indices_data_df.iloc[1:]
indices_data_df.columns = new_header

indices_data_df.head()

In [None]:
# USA - S&P 500 dataset: the first seven columns of the indices table.
# An explicit copy is taken because later cells assign new 'Date', 'Close' and
# 'Index' columns into this frame; writing into a bare slice of
# indices_data_df triggers pandas' SettingWithCopyWarning.
snp_df = indices_data_df.iloc[:, :7].copy()
snp_df.head(30)

In [None]:
# Parse the S&P 500 dates and closing prices into datetime / float values.
snp_df['Date'] = pd.to_datetime(snp_df['Date'])
snp_df['Close'] = snp_df['Close'].astype(float)

# Line chart of the closing price over time.
fig, ax = plt.subplots(figsize=(20, 10))
x, y = snp_df['Date'], snp_df['Close']
ax.plot(x, y)
ax.set_xlabel('Date')
ax.set_ylabel('Closing Price')
ax.set_title('S&P 500 Index - Closing Prices')
ax.grid(True)
# NOTE(review): the file name ends in ".png.png" -- looks like a typo, left
# unchanged in case something else references the saved file.
fig.savefig('S&P 500 Index - Closing Prices.png.png', format='png', dpi=300)
plt.show()

In [None]:
# Keep only the columns the analysis uses and order rows from the largest
# cumulative case count down.
covid_columns = ["Location", "Date", "Total Cases", "Total Deaths", "Population"]
covid_data_df = covid_data_df[covid_columns].sort_values(by='Total Cases', ascending=False)
covid_data_df.head(10)

In [None]:
# Rows of the COVID table reported for the United States.
is_united_states = covid_data_df["Location"] == "United States"
us_data = covid_data_df[is_united_states]
us_data.head()



In [None]:
# Rows of the COVID table reported for China.
is_china = covid_data_df["Location"] == "China"
china_data = covid_data_df[is_china]
china_data.head()

In [None]:
# Rows of the COVID table reported for India.
is_india = covid_data_df["Location"] == "India"
india_data = covid_data_df[is_india]
india_data.head()


In [None]:
# Rows of the COVID table reported for France.
is_france = covid_data_df["Location"] == "France"
france_data = covid_data_df[is_france]
france_data.head()


In [None]:
# Rows of the COVID table reported for Germany.
is_germany = covid_data_df["Location"] == "Germany"
germany_data = covid_data_df[is_germany]
germany_data.head()

In [None]:
# Average of the cumulative "Total Cases" column for each of the five countries.
mean_total_us_cases = us_data["Total Cases"].mean()
mean_total_china_cases = china_data["Total Cases"].mean()
mean_total_india_cases = india_data["Total Cases"].mean()
mean_total_france_cases = france_data["Total Cases"].mean()
mean_total_germany_cases = germany_data["Total Cases"].mean()

# Print each mean under a one-line caption, in the same country order.
mean_cases_report = (
    ("the United States", mean_total_us_cases),
    ("China", mean_total_china_cases),
    ("India", mean_total_india_cases),
    ("France", mean_total_france_cases),
    ("Germany", mean_total_germany_cases),
)
for place_name, mean_cases in mean_cases_report:
    print(f"Mean total cases in {place_name} are")
    print(mean_cases)


In [None]:
# Graph total cases for each country.

# Parse the dates and put every country's rows in chronological order.
# covid_data_df was sorted by "Total Cases" (descending) earlier, which is not
# a date order, and a line chart connects points in row order. `assign`
# returns a new frame, so nothing is written into a filtered slice of
# covid_data_df (avoids SettingWithCopyWarning).
us_data, china_data, india_data, france_data, germany_data = (
    country_frame.assign(Date=pd.to_datetime(country_frame['Date'])).sort_values('Date')
    for country_frame in (us_data, china_data, india_data, france_data, germany_data)
)

plt.figure(figsize=(20, 10))
plt.plot(us_data['Date'], us_data['Total Cases'], label='USA')
plt.plot(china_data['Date'], china_data['Total Cases'], label='China')
plt.plot(india_data['Date'], india_data['Total Cases'], label='India')
plt.plot(france_data['Date'], france_data['Total Cases'], label='France')
plt.plot(germany_data['Date'], germany_data['Total Cases'], label='Germany')

plt.xlabel('Date')
plt.ylabel('Total Cases')
plt.title('Total COVID-19 Cases in Countries Over Time')
plt.xticks(rotation=45)
plt.grid(True)
# The lines carry labels, so draw the legend; without it the five series
# cannot be told apart.
plt.legend(loc='upper left')
plt.tight_layout()
plt.savefig('Total COVID-19 Cases in Countries Over Time.png', format='png', dpi=300)
plt.show()

In [None]:
# Bar graph of the average prevalence rate (total cases / population) per country.

covid_data_df['Prevalence Rate'] = covid_data_df['Total Cases'].div(covid_data_df['Population'])

# Re-slice the per-country frames so they include the new column.
us_data, china_data, india_data, france_data, germany_data = (
    covid_data_df[covid_data_df["Location"] == country_name]
    for country_name in ("United States", "China", "India", "France", "Germany")
)

country_frames = (us_data, china_data, india_data, france_data, germany_data)
prevalence_data = {
    'Country': ['USA', 'China', 'India', 'France', 'Germany'],
    'Prevalence Rate': [country_frame['Prevalence Rate'].mean() for country_frame in country_frames],
}

prevalence_df = pd.DataFrame(prevalence_data)

bar_colors = ['blue', 'red', 'green', 'purple', 'orange']
plt.figure(figsize=(12, 8))
plt.bar(prevalence_df['Country'], prevalence_df['Prevalence Rate'], color=bar_colors)
plt.xlabel('Location')
plt.ylabel('Prevalence Rate')
plt.title('Average Prevalence Rate based on Country')
plt.grid(axis='y', linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.savefig('Average Prevalence Rate based on Country.png', format='png', dpi=300)
plt.show()

In [None]:
# Bar graph of the largest "Total Cases" value recorded for each country.
covid_data_df['Prevalence Rate'] = covid_data_df['Total Cases'].div(covid_data_df['Population'])

us_data, china_data, india_data, france_data, germany_data = (
    covid_data_df[covid_data_df["Location"] == country_name]
    for country_name in ("United States", "China", "India", "France", "Germany")
)

country_frames = (us_data, china_data, india_data, france_data, germany_data)
total_cases_data = {
    'Country': ['USA', 'China', 'India', 'France', 'Germany'],
    'Total Cases': [country_frame['Total Cases'].max() for country_frame in country_frames],
}

total_cases_df = pd.DataFrame(total_cases_data)

bar_colors = ['blue', 'red', 'green', 'purple', 'orange']
plt.figure(figsize=(12, 8))
plt.bar(total_cases_df['Country'], total_cases_df['Total Cases'], color=bar_colors)
plt.xlabel('Location')
plt.ylabel('Total Cases')
plt.title('Total Cases based on Country')
plt.grid(axis='y', linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.savefig('Total Cases based on Country.png', format='png', dpi=300)
plt.show()

In [None]:
# Largest "Total Cases" value per location, biggest first; keep the top ten.
total_cases_by_country = (
    covid_data_df
    .groupby('Location')['Total Cases']
    .max()
    .sort_values(ascending=False)
)
top_10_countries = total_cases_by_country.iloc[:10].reset_index()
top_10_countries.head(30)

In [None]:
# Number of rows recorded for each location (missing locations counted too).
location_column = covid_data_df['Location']
countries = location_column.value_counts(dropna=False)
print(countries)


In [None]:
# Two-column view of the COVID table: location and population.
population_columns = ['Location', 'Population']
location_population = covid_data_df[population_columns]
location_population

In [None]:
# Largest population value recorded for each location.
location_population_max = location_population.groupby(by='Location').max()
location_population_max

In [None]:
# Order locations from the largest population to the smallest.
location_population_max = location_population_max.sort_values(by='Population', ascending=False)
location_population_max

In [None]:
# Drop the rows (index labels, not columns) that are aggregates rather than
# individual countries. The list matches the one used for the total-cases
# table in this notebook, which also removes 'Europe' and 'South America'.
aggregate_locations = [
    'Lower middle income',
    'Upper middle income',
    'World',
    'Asia',
    'Africa',
    'High income',
    'Low income',
    'European Union',
    'North America',
    'Europe',
    'South America',
]
# errors='ignore' keeps the cell re-runnable: this cell overwrites its own
# input, so on a second run the labels are already gone and a plain drop()
# would raise KeyError.
location_population_max = location_population_max.drop(index=aggregate_locations, errors='ignore')
location_population_max.head(30)

In [None]:
# Per-location maxima of "Total Cases" and "Date", largest case count first,
# with aggregate (non-country) rows removed. The maximum is taken per column,
# so each row pairs the largest case count with the largest date value.
total_cases = covid_data_df[['Location', 'Total Cases', 'Date']]
non_country_rows = [
    'Lower middle income',
    'Upper middle income',
    'World',
    'Asia',
    'Africa',
    'High income',
    'Low income',
    'European Union',
    'North America',
    'Europe',
    'South America',
]
total_cases_max = (
    total_cases
    .groupby('Location')
    .max()
    .sort_values('Total Cases', ascending=False)
    .drop(non_country_rows)
)
total_cases_max.head(40)

In [None]:
# Parse the "Date" column into datetime values and show the resulting dtypes.
parsed_dates = pd.to_datetime(total_cases_max['Date'])
total_cases_max = total_cases_max.assign(Date=parsed_dates)
print(total_cases_max.dtypes)
total_cases_max.head()

In [None]:
# Move "Location" back into a regular column, then build a date-indexed view.
total_cases_max = total_cases_max.reset_index(drop=False)
date_total_cases = total_cases_max.set_index(keys='Date')
date_total_cases.head(n=10)

In [None]:
# China - SSE Composite Index: column 0 plus columns 113-118 of the indices table.
# An explicit copy is taken because later cells assign 'Date', 'Close' and
# 'Index' columns into this frame; writing into a bare slice triggers pandas'
# SettingWithCopyWarning.
sse_columns = [0] + list(range(113, 119))
sse_df = indices_data_df.iloc[:, sse_columns].copy()
sse_df.head(130)

In [None]:
# India - S&P BSE SENSEX: column 0 plus columns 141-146 of the indices table.
# An explicit copy is taken because later cells assign 'Date', 'Close' and
# 'Index' columns into this frame; writing into a bare slice triggers pandas'
# SettingWithCopyWarning.
bse_columns = [0] + list(range(141, 147))
bse_df = indices_data_df.iloc[:, bse_columns].copy()
bse_df.head(130)

In [None]:
# France - CAC 40: column 0 plus columns 64-69 of the indices table.
# An explicit copy is taken because later cells assign 'Date', 'Close' and
# 'Index' columns into this frame; writing into a bare slice triggers pandas'
# SettingWithCopyWarning.
cac_columns = [0] + list(range(64, 70))
cac_df = indices_data_df.iloc[:, cac_columns].copy()
cac_df.head(130)

In [None]:
# Germany - DAX Performance-Index: column 0 plus columns 57-62 of the indices table.
# An explicit copy is taken because later cells assign 'Date', 'Close' and
# 'Index' columns into this frame; writing into a bare slice triggers pandas'
# SettingWithCopyWarning.
dax_columns = [0] + list(range(57, 63))
dax_df = indices_data_df.iloc[:, dax_columns].copy()
dax_df.head(130)

In [None]:
# The five index frames paired with the legend label used for each.
labelled_index_frames = (
    (snp_df, 'USA - S&P 500 Index'),
    (sse_df, 'China - SSE Composite Index'),
    (bse_df, 'India - BSE SENSEX Index'),
    (cac_df, 'France - CAC 40'),
    (dax_df, 'Germany - DAX Index'),
)

# Parse every Date column first, then cast every Close column to float.
for index_frame, _ in labelled_index_frames:
    index_frame['Date'] = pd.to_datetime(index_frame['Date'])
for index_frame, _ in labelled_index_frames:
    index_frame['Close'] = index_frame['Close'].astype(float)

# One line per index on a shared set of axes.
plt.figure(figsize=(20, 10))
for index_frame, legend_label in labelled_index_frames:
    plt.plot(index_frame['Date'], index_frame['Close'], label=legend_label)

# Axis labels, title, grid and legend.
plt.xlabel('Date')
plt.ylabel('Closing Price')
plt.title('Stock Market Indices - Closing Prices')
plt.grid(True)
plt.legend()
plt.savefig('Stock Market Indices - Closing Prices.png', format='png', dpi=300)
plt.show()

In [None]:
# Restrict every index to the study window: January 1, 2020 to December 31, 2021
# (both ends inclusive).
start_date = '2020-01-01'
end_date = '2021-12-31'

snp_df_filtered = snp_df[(snp_df['Date'] >= start_date) & (snp_df['Date'] <= end_date)]
sse_df_filtered = sse_df[(sse_df['Date'] >= start_date) & (sse_df['Date'] <= end_date)]
bse_df_filtered = bse_df[(bse_df['Date'] >= start_date) & (bse_df['Date'] <= end_date)]
cac_df_filtered = cac_df[(cac_df['Date'] >= start_date) & (cac_df['Date'] <= end_date)]
dax_df_filtered = dax_df[(dax_df['Date'] >= start_date) & (dax_df['Date'] <= end_date)]

# Plot the graph
plt.figure(figsize=(20,10))

# Plot the index dataframes
plt.plot(snp_df_filtered['Date'], snp_df_filtered['Close'], label='USA - S&P 500 Index')
plt.plot(sse_df_filtered['Date'], sse_df_filtered['Close'], label='China - SSE Composite Index')
plt.plot(bse_df_filtered['Date'], bse_df_filtered['Close'], label='India - BSE SENSEX Index')
plt.plot(cac_df_filtered['Date'], cac_df_filtered['Close'], label='France - CAC 40')
plt.plot(dax_df_filtered['Date'], dax_df_filtered['Close'], label='Germany - DAX Index')

# Create the graph details
plt.xlabel('Date')
plt.ylabel('Closing Price')
# The title states the window that is actually plotted (end_date is in 2021;
# the previous title said "Dec 31, 2020").
plt.title('Stock Market Indices - Closing Prices (Jan 1, 2020 to Dec 31, 2021)')
plt.grid(True)
plt.legend()
# NOTE(review): the file name still says "Dec 31, 2020"; it is left unchanged
# in case the saved image is referenced elsewhere.
plt.savefig('Stock Market Indices - Closing Prices (Jan 1, 2020 to Dec 31, 2020).png', format='png', dpi=300)
plt.show()

In [None]:
# Tag each filtered frame with the name of its index, then stack all five
# into one long table.
filtered_frames_by_name = (
    (snp_df_filtered, 'S&P 500'),
    (sse_df_filtered, 'SSE Composite Index'),
    (bse_df_filtered, 'BSE Sensex'),
    (cac_df_filtered, 'CAC 40'),
    (dax_df_filtered, 'DAX Performance'),
)
for filtered_frame, index_name in filtered_frames_by_name:
    filtered_frame['Index'] = index_name

filtered_merged_indices = pd.concat(
    [filtered_frame for filtered_frame, _ in filtered_frames_by_name],
    ignore_index=True,
)
filtered_merged_indices.head()

In [None]:
# Merging the indices dataframes: first tag each full-history frame with the
# name of its index.
full_frames_by_name = (
    (snp_df, 'S&P 500'),
    (sse_df, 'SSE Composite Index'),
    (bse_df, 'BSE Sensex'),
    (cac_df, 'CAC 40'),
    (dax_df, 'DAX Performance'),
)
for full_frame, index_name in full_frames_by_name:
    full_frame['Index'] = index_name

In [None]:
# Stack the five full-history index frames into one table with a fresh index.
frames_to_stack = [snp_df, sse_df, bse_df, cac_df, dax_df]
merged_indices_df = pd.concat(frames_to_stack, ignore_index=True)
merged_indices_df.head()

In [None]:
# Parse the COVID "Date" column into datetime values.
covid_dates = pd.to_datetime(covid_data_df['Date'])
covid_data_df = covid_data_df.assign(Date=covid_dates)
covid_data_df.head()

In [None]:
# Preview the stacked full-history index table.
merged_indices_df.head(n=5)

In [None]:
# Index names in order of first appearance, i.e. the order in which the
# per-index frames were concatenated (S&P 500, SSE, BSE, CAC, DAX).
# This list is paired by position with the country list in a later cell.
# value_counts() orders names by how often they occur, which does not
# guarantee that order and could attach an index to the wrong country.
index_list = filtered_merged_indices['Index'].drop_duplicates().tolist()

In [None]:
# Country names, listed to line up by position with the index names.
united_index = [
    'United States',
    'China',
    'India',
    'France',
    'Germany',
]

In [None]:
# Map each index name to the country at the same position in united_index.
location_dict = {index_name: country for index_name, country in zip(index_list, united_index)}
location_dict

In [None]:
# Look up the country for every row's index name (an unknown name raises KeyError).
filtered_merged_indices['Location'] = [
    location_dict[index_name] for index_name in filtered_merged_indices['Index']
]
filtered_merged_indices.head()

In [None]:
# Lift pandas' cap on the number of rows rendered in cell output.
pd.options.display.max_rows = None

In [None]:
# Attach the COVID figures to every index row that shares its date and
# country; index rows without a match are kept (left join).
merge_keys = ['Date', 'Location']
merged_indices_and_covid = filtered_merged_indices.merge(covid_data_df, how='left', on=merge_keys)
merged_indices_and_covid.head(50)

In [None]:
# Total COVID-19 cases per country over time (line chart, one line per country).

# Build one frame per country. `assign` returns a new frame, so the parsed
# 'Date' column is not written into a filtered slice of
# merged_indices_and_covid (avoids SettingWithCopyWarning).
us_merged_data, china_merged_data, india_merged_data, france_merged_data, germany_merged_data = (
    merged_indices_and_covid[merged_indices_and_covid["Location"] == country_name]
    .assign(Date=lambda rows: pd.to_datetime(rows['Date']))
    for country_name in ("United States", "China", "India", "France", "Germany")
)

plt.figure(figsize=(20, 10))
plt.plot(us_merged_data['Date'], us_merged_data['Total Cases'], label='USA')
plt.plot(china_merged_data['Date'], china_merged_data['Total Cases'], label='China')
plt.plot(india_merged_data['Date'], india_merged_data['Total Cases'], label='India')
plt.plot(france_merged_data['Date'], france_merged_data['Total Cases'], label='France')
plt.plot(germany_merged_data['Date'], germany_merged_data['Total Cases'], label='Germany')

plt.xlabel('Date')
plt.ylabel('Total Cases')
plt.title('Total COVID-19 Cases in Countries Over Time')
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.legend(loc='upper left')
# NOTE(review): other cells in this notebook save to this same file name, so
# whichever runs last overwrites the image.
plt.savefig('Total COVID-19 Cases in Countries Over Time.png', format='png', dpi=300)
plt.show()


In [None]:
# Total COVID-19 cases per country over time, drawn as grouped bars.

# Build one frame per country. `assign` returns a new frame, so the parsed
# 'Date' column is not written into a filtered slice (avoids
# SettingWithCopyWarning).
us_merged_data, china_merged_data, india_merged_data, france_merged_data, germany_merged_data = (
    merged_indices_and_covid[merged_indices_and_covid["Location"] == country_name]
    .assign(Date=lambda rows: pd.to_datetime(rows['Date']))
    for country_name in ("United States", "China", "India", "France", "Germany")
)

fig, ax = plt.subplots(figsize=(20, 10))
bar_width = 5

# Shift each country's bars by its own offset (in days) so that the five bars
# for one date sit side by side. China and India previously shared the -5 day
# offset, which drew India's bars on top of China's.
grouped_bars = (
    ('USA', us_merged_data, -10),
    ('China', china_merged_data, -5),
    ('India', india_merged_data, 0),
    ('France', france_merged_data, 5),
    ('Germany', germany_merged_data, 10),
)
for legend_label, country_rows, offset_days in grouped_bars:
    ax.bar(
        country_rows['Date'] + pd.Timedelta(days=offset_days),
        country_rows['Total Cases'],
        width=bar_width,
        label=legend_label,
    )

ax.set_xlabel('Date')
ax.set_ylabel('Total Cases')
ax.set_title('Total COVID-19 Cases in Countries Over Time')
ax.legend(loc='upper left')

plt.xticks(rotation=45)
plt.grid(True, which='both', linestyle='--', linewidth=0.5)
plt.tight_layout()
# NOTE(review): other cells in this notebook save to this same file name, so
# whichever runs last overwrites the image.
plt.savefig('Total COVID-19 Cases in Countries Over Time.png', format='png', dpi=300)
plt.show()

In [None]:
# Prevalence rate (total cases / population) scattered against the index name.

merged_indices_and_covid['Prevalence Rate'] = merged_indices_and_covid['Total Cases'].div(merged_indices_and_covid['Population'])

# Per-country frames, each with its 'Date' column parsed to datetime.
us_merged_data, china_merged_data, india_merged_data, france_merged_data, germany_merged_data = (
    merged_indices_and_covid[merged_indices_and_covid["Location"] == country_name]
    for country_name in ("United States", "China", "India", "France", "Germany")
)
for country_rows in (us_merged_data, china_merged_data, india_merged_data, france_merged_data, germany_merged_data):
    country_rows['Date'] = pd.to_datetime(country_rows['Date'])

# From here on the merged table only keeps rows that have both a prevalence
# rate and an index name.
merged_indices_and_covid = merged_indices_and_covid.dropna(subset=['Prevalence Rate', 'Index'])

plt.figure(figsize=(10, 6))
plt.scatter(
    merged_indices_and_covid['Prevalence Rate'],
    merged_indices_and_covid['Index'],
    alpha=0.5,
)
plt.xlabel('Prevalence Rate')
plt.ylabel('Stock Index')
plt.title('Prevalence Rate based on Stock Index Performance')
plt.savefig('Prevalence Rate based on Stock Index Performance.png', format='png', dpi=300 )
plt.show()



In [None]:
# Prevalence rate vs stock index closing price: per country, then pooled.

merged_indices_and_covid['Prevalence Rate'] = merged_indices_and_covid['Total Cases'] / merged_indices_and_covid['Population']

# One frame per index. The names must match the 'Index' labels assigned when
# the frames were tagged; the DAX label is 'DAX Performance' (the previous
# spelling "Dax Performance" matched no rows, so Germany was never plotted).
# `assign` returns a new frame, so nothing is written into a filtered slice.
us_merged_data, china_merged_data, india_merged_data, france_merged_data, germany_merged_data = (
    merged_indices_and_covid[merged_indices_and_covid["Index"] == index_name]
    .assign(Date=lambda rows: pd.to_datetime(rows['Date']))
    for index_name in ("S&P 500", "SSE Composite Index", "BSE Sensex", "CAC 40", "DAX Performance")
)

# From here on the merged table only keeps rows that have both a prevalence
# rate and a closing price.
merged_indices_and_covid = merged_indices_and_covid.dropna(subset=['Prevalence Rate', 'Close'])

# Figure 1: one colour per country.
plt.figure(figsize=(10, 6))
plt.scatter(us_merged_data['Close'], us_merged_data['Prevalence Rate'], label='USA', alpha=0.5)
plt.scatter(china_merged_data['Close'], china_merged_data['Prevalence Rate'], label='China', alpha=0.5)
plt.scatter(india_merged_data['Close'], india_merged_data['Prevalence Rate'], label='India', alpha=0.5)
plt.scatter(france_merged_data['Close'], france_merged_data['Prevalence Rate'], label='France', alpha=0.5)
plt.scatter(germany_merged_data['Close'], germany_merged_data['Prevalence Rate'], label='Germany', alpha=0.5)
plt.xlabel('Stock Index')
plt.ylabel('Prevalence Rate')
plt.title('Stock Index Performance vs Prevalence Rate by Country')
plt.legend()
plt.show()

# Figure 2: all countries pooled into a single unlabeled series. No legend is
# drawn: the previous bare `plt.legend` (without parentheses) did nothing, and
# there are no labeled series to list.
plt.figure(figsize=(10, 6))
plt.scatter(merged_indices_and_covid['Prevalence Rate'], merged_indices_and_covid['Close'], alpha=0.5)

plt.xlabel('Prevalence Rate')
plt.ylabel('Stock Index')
plt.title('Stock Index Performance vs Prevalence Rate')
plt.savefig('Stock Index Performance vs Prevalence Rate.png', format='png', dpi=300 )
plt.show()

In [None]:
# Prevalence rate plotted against the index name: one line per country with a
# scatter of all rows drawn on top.
merged_indices_and_covid['Prevalence Rate'] = merged_indices_and_covid['Total Cases'].div(merged_indices_and_covid['Population'])

# Per-country frames, each with its 'Date' column parsed to datetime.
us_merged_data, china_merged_data, india_merged_data, france_merged_data, germany_merged_data = (
    merged_indices_and_covid[merged_indices_and_covid["Location"] == country_name]
    for country_name in ("United States", "China", "India", "France", "Germany")
)
for country_rows in (us_merged_data, china_merged_data, india_merged_data, france_merged_data, germany_merged_data):
    country_rows['Date'] = pd.to_datetime(country_rows['Date'])

merged_indices_and_covid = merged_indices_and_covid.dropna(subset=['Prevalence Rate', 'Index'])

plt.figure(figsize=(20, 10))
labelled_country_rows = (
    ('USA', us_merged_data),
    ('China', china_merged_data),
    ('India', india_merged_data),
    ('France', france_merged_data),
    ('Germany', germany_merged_data),
)
for legend_label, country_rows in labelled_country_rows:
    plt.plot(country_rows['Index'], country_rows['Prevalence Rate'], label=legend_label)

plt.scatter(
    merged_indices_and_covid['Index'],
    merged_indices_and_covid['Prevalence Rate'],
    alpha=0.5,
)
plt.xlabel('Stock Index')
plt.ylabel('Prevalence Rate')
plt.title('Stock Index vs Prevalence Rate')
plt.savefig('Stock Index vs Prevalence Rate.png', format='png', dpi=300 )
plt.show()


In [None]:
# Bar chart of the prevalence rate per country, using the merged index/COVID table.
merged_indices_and_covid['Prevalence Rate'] = merged_indices_and_covid['Total Cases'] / merged_indices_and_covid['Population']

us_merged_data = merged_indices_and_covid[merged_indices_and_covid["Location"] == "United States"]
china_merged_data = merged_indices_and_covid[merged_indices_and_covid["Location"] == "China"]
india_merged_data = merged_indices_and_covid[merged_indices_and_covid["Location"] == "India"]
france_merged_data = merged_indices_and_covid[merged_indices_and_covid["Location"] == "France"]
germany_merged_data = merged_indices_and_covid[merged_indices_and_covid["Location"] == "Germany"]

# Mean prevalence rate per country over the rows in the merged table.
prevalence_data = {
    'Country': ['USA', 'China', 'India', 'France', 'Germany'],
    'Prevalence Rate': [
        us_merged_data['Prevalence Rate'].mean(),
        china_merged_data['Prevalence Rate'].mean(),
        india_merged_data['Prevalence Rate'].mean(),
        france_merged_data['Prevalence Rate'].mean(),
        germany_merged_data['Prevalence Rate'].mean()
    ]
}

plt.figure(figsize=(12, 8))
# Plot the five per-country means computed above: one bar and one colour per
# country. The previous call passed every row of the merged table to plt.bar,
# stacking many bars on each country while cycling the five colours across
# rows, and left prevalence_data unused.
# NOTE(review): bars now show the mean rather than the tallest overlapping
# row -- confirm that the average is the intended statistic.
plt.bar(prevalence_data['Country'], prevalence_data['Prevalence Rate'], color=['blue', 'red', 'green', 'purple', 'orange'])
plt.xlabel('Location')
plt.ylabel('Prevalence Rate')
plt.title('Prevalence Rate based on Country')
plt.grid(axis='y', linestyle='--', linewidth=0.5)
plt.tight_layout()
plt.savefig('Prevalence Rate based on Country.png', format='png', dpi=300 )
plt.show()




In [None]:
# Bar chart of the largest "Total Cases" value per country in the merged table.
merged_indices_and_covid['Prevalence Rate'] = merged_indices_and_covid['Total Cases'] / merged_indices_and_covid['Population']

us_merged_data = merged_indices_and_covid[merged_indices_and_covid["Location"] == "United States"]
china_merged_data = merged_indices_and_covid[merged_indices_and_covid["Location"] == "China"]
india_merged_data = merged_indices_and_covid[merged_indices_and_covid["Location"] == "India"]
france_merged_data = merged_indices_and_covid[merged_indices_and_covid["Location"] == "France"]
germany_merged_data = merged_indices_and_covid[merged_indices_and_covid["Location"] == "Germany"]

# One value per country: the maximum of the "Total Cases" column, the same
# statistic the earlier COVID-only "Total Cases based on Country" chart uses.
# The previous version rebuilt an unused prevalence-rate dict here and passed
# every row of the merged table to plt.bar, stacking many bars on each country
# while cycling the five colours across rows.
merged_total_cases = {
    'Country': ['USA', 'China', 'India', 'France', 'Germany'],
    'Total Cases': [
        us_merged_data['Total Cases'].max(),
        china_merged_data['Total Cases'].max(),
        india_merged_data['Total Cases'].max(),
        france_merged_data['Total Cases'].max(),
        germany_merged_data['Total Cases'].max()
    ]
}

plt.figure(figsize=(12, 8))
plt.bar(merged_total_cases['Country'], merged_total_cases['Total Cases'], color=['blue', 'red', 'green', 'purple', 'orange'])
plt.xlabel('Location')
plt.ylabel('Total Cases')
plt.title('Total Cases based on Country')
plt.grid(axis='y', linestyle='--', linewidth=0.5)
plt.tight_layout()
# NOTE(review): the COVID-only bar chart earlier in the notebook saves to this
# same file name, so this image overwrites it.
plt.savefig('Total Cases based on Country.png', format='png', dpi=300 )
plt.show()

In [None]:
# Prevalence rate over time: one line per country plus a scatter of all rows.

merged_indices_and_covid['Prevalence Rate'] = merged_indices_and_covid['Total Cases'].div(merged_indices_and_covid['Population'])

# Per-country frames, each with its 'Date' column parsed to datetime.
us_merged_data, china_merged_data, india_merged_data, france_merged_data, germany_merged_data = (
    merged_indices_and_covid[merged_indices_and_covid["Location"] == country_name]
    for country_name in ("United States", "China", "India", "France", "Germany")
)
for country_rows in (us_merged_data, china_merged_data, india_merged_data, france_merged_data, germany_merged_data):
    country_rows['Date'] = pd.to_datetime(country_rows['Date'])

merged_indices_and_covid = merged_indices_and_covid.dropna(subset=['Date', 'Prevalence Rate'])

plt.figure(figsize=(20, 10))
labelled_country_rows = (
    ('USA', us_merged_data),
    ('China', china_merged_data),
    ('India', india_merged_data),
    ('France', france_merged_data),
    ('Germany', germany_merged_data),
)
for legend_label, country_rows in labelled_country_rows:
    plt.plot(country_rows['Date'], country_rows['Prevalence Rate'], label=legend_label)

# Defined here but not used by the calls in this cell.
markers = ['o', 's', 'D', '^', 'P']
colors = ['blue', 'green', 'red', 'purple', 'orange']
plt.scatter(
    merged_indices_and_covid['Date'],
    merged_indices_and_covid['Prevalence Rate'],
    alpha=0.5,
)
plt.xlabel('Date')
plt.ylabel('Prevalence Rate')
plt.title('Prevalence Rate vs. Time')
plt.legend(loc = 'upper left')
plt.tight_layout()
plt.savefig('Prevalence Rate vs. Time.png', format='png', dpi=300 )
plt.show()

In [None]:
# Restrict the merged table to calendar year 2020 (2020-01-01 to 2020-12-31),
# scatter total cases over time per country, then print per-country sums and
# the closing price on three reference dates.
start_date = '2020-01-01'
end_date = '2020-12-31'

# Build the cleaned table as one chain of new frames instead of mutating a
# filtered slice in place (the previous inplace dropna / column assignment on
# the slice triggers SettingWithCopyWarning).
merged_indices_and_covid = merged_indices_and_covid.assign(
    Date=pd.to_datetime(merged_indices_and_covid['Date'])
)
in_window = (merged_indices_and_covid['Date'] >= start_date) & (merged_indices_and_covid['Date'] <= end_date)
merged_indices_and_covid = (
    merged_indices_and_covid[in_window]
    # Drop rows with NaN values in 'Total Cases' and 'Volume'
    .dropna(subset=['Total Cases', 'Volume'])
    # 'Total Cases' is a count; store it as an integer
    .astype({'Total Cases': int})
)

# Scatter plot with 'Date' on the x-axis and 'Total Cases' on the y-axis
plt.figure(figsize=(12, 6))

# Define markers and colors for different locations
markers = ['o', 's', 'D', '^', 'P']
colors = ['blue', 'green', 'red', 'purple', 'orange']

# Plot each location with its own marker and color
for location, color, marker in zip(merged_indices_and_covid['Location'].unique(), colors, markers):
    location_data = merged_indices_and_covid[merged_indices_and_covid['Location'] == location]
    plt.scatter(location_data['Date'], location_data['Total Cases'], color=color, label=location, marker=marker, alpha=0.7)

# Formatting x-axis to show date labels nicely
plt.gca().xaxis.set_major_formatter(plt.matplotlib.dates.DateFormatter('%Y-%m-%d'))
plt.gca().xaxis.set_major_locator(plt.matplotlib.dates.MonthLocator())

# Adding grid, title, and labels. The title is built from the actual filter
# bounds (the previous text claimed the range ran to 2021-12-31) and names
# only the quantity that is plotted (Volume is not drawn).
plt.grid(True)
plt.title(f'Scatter Plot: Total Cases over Time ({start_date} to {end_date})')
plt.xlabel('Date')
plt.ylabel('Total Cases')

# Rotating x-axis labels for better readability
plt.xticks(rotation=45)

# Showing legends
plt.legend(loc='upper left')

# Automatically adjust layout for better fit
plt.tight_layout()

# NOTE(review): the file name is left unchanged in case the image is
# referenced elsewhere; it still mentions Volume and 2021-12-31, and the ':'
# is not a valid file-name character on Windows.
plt.savefig('Scatter Plot: Total Cases and Volume over Time (2020-01-01 to 2021-12-31).png', format='png', dpi=300 )

# Display the plot
plt.show()

# Group by 'Location' and calculate sum of 'Total Cases' and 'Volume'.
# NOTE(review): 'Volume' is never cast to a number in the cells shown (only
# 'Close' is), so it is presumably still text from the promoted-header CSV;
# summing text would concatenate it. It is converted for the aggregate only,
# and values that cannot be parsed become NaN and are skipped by the sum.
sums_per_location = (
    merged_indices_and_covid
    .assign(Volume=pd.to_numeric(merged_indices_and_covid['Volume'], errors='coerce'))
    .groupby('Location')
    .agg({'Total Cases': 'sum', 'Volume': 'sum'})
    .reset_index()
)

# Print the sums for each location
print("Sums for each location:")
print(sums_per_location)

# List of locations
locations = ['United States', 'China', 'India', 'France', 'Germany']


def _close_on(frame, location, date):
    """Return the first 'Close' value in `frame` for `location` on `date`, or None if no row matches."""
    matching_rows = frame[(frame['Date'] == date) & (frame['Location'] == location)]
    if matching_rows.empty:
        return None
    return matching_rows['Close'].values[0]


# Reference dates as (ISO date used for the lookup, wording used in the output).
report_dates = (
    ('2020-02-03', 'February 3, 2020'),
    ('2020-07-01', 'July 1, 2020'),
    ('2020-12-30', 'December 30, 2020'),
)

# For every location, print the closing price (rounded to a whole number) on
# each reference date, or a notice when the table has no row for that date.
for location in locations:
    print()
    print(f"Location: {location}")
    for iso_date, date_wording in report_dates:
        close_value = _close_on(merged_indices_and_covid, location, iso_date)
        if close_value is not None:
            print(f"Closing price on {date_wording}: {round(close_value)}")
        else:
            print(f"No data available for {date_wording}")

 
    

In [None]:
# For each location, look up the closing price on February 3 and December 30,
# 2020, and print both together with the percent change between them.
for location in locations:
    is_location = merged_indices_and_covid['Location'] == location

    # Rows for this location on each of the two reference dates.
    filtered_feb_3_2020 = merged_indices_and_covid[(merged_indices_and_covid['Date'] == '2020-02-03') & is_location]
    filtered_dec_30_2020 = merged_indices_and_covid[(merged_indices_and_covid['Date'] == '2020-12-30') & is_location]

    # First matching closing price, or None when the date has no row.
    close_feb_3_2020 = None if filtered_feb_3_2020.empty else filtered_feb_3_2020['Close'].values[0]
    close_dec_30_2020 = None if filtered_dec_30_2020.empty else filtered_dec_30_2020['Close'].values[0]

    # The percent change needs both prices; it is rounded to two decimals.
    if close_feb_3_2020 is None or close_dec_30_2020 is None:
        percent_difference_rounded = None
    else:
        percent_difference = ((close_dec_30_2020 - close_feb_3_2020) / close_feb_3_2020) * 100
        percent_difference_rounded = round(percent_difference, 2)

    # Report, with one line per value and a blank line after each location.
    print(f"Location: {location}")
    print(
        f"Closing price on February 3, 2020: {close_feb_3_2020:.2f}"
        if close_feb_3_2020 is not None
        else "No data available for February 3, 2020"
    )
    print(
        f"Closing price on December 30, 2020: {close_dec_30_2020:.2f}"
        if close_dec_30_2020 is not None
        else "No data available for December 30, 2020"
    )
    print(
        f"Percent difference from February 3, 2020 to December 30, 2020: {percent_difference_rounded}%"
        if percent_difference_rounded is not None
        else "Unable to calculate percent difference"
    )

    print()

In [None]:
# Fit closing price as a linear function of total cases (all countries
# pooled) and report R^2, which is computed on the same rows used for the fit.
X = merged_indices_and_covid['Total Cases'].values.reshape(-1,1)
y = merged_indices_and_covid['Close'].values
model = LinearRegression()
model.fit(X,y)

y_pred = model.predict(X)
r2 = r2_score(y, y_pred)

plt.figure(figsize=(10,10))
# Observations: total cases on the x-axis, closing price on the y-axis.
# NOTE(review): the previous version also scattered 'Close' against itself in
# red on these axes. The x-axis is Total Cases, so that series only drew the
# line y = x at closing-price scale; it is removed here.
plt.scatter(merged_indices_and_covid['Total Cases'], merged_indices_and_covid['Close'], c='green', label='Total Cases')

# Regression line drawn by seaborn over the same data.
sns.regplot(x='Total Cases', y='Close', data=merged_indices_and_covid, scatter=False, color='blue')
plt.text(0.05, 0.95, f'$R^2 = {r2:.2f}$', transform=plt.gca().transAxes, fontsize=12, verticalalignment='top')

plt.xlabel('Total Cases')
plt.ylabel('Close')
plt.title('Linear Regression Between Stock Close and Total Cases')
# Draw the legend before saving so that it appears in the saved image too.
plt.legend()
plt.savefig('Linear Regression Between Stock Close and Total Cases.png', format='png', dpi=300 )
plt.show()