In [None]:
#Import the relevant libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.linear_model import LinearRegression

# Load the raw river toxin measurements from the CSV file
df = pd.read_csv('National_River_Toxin_Dataset_1.csv')

# Quick look at the data: first rows, column dtypes/non-null counts,
# overall shape and summary statistics
print(df.head())
# info() writes its report itself and returns None, so it must not be wrapped
# in print() (that would emit a stray "None" line after the report)
df.info()
print(df.shape)
print(df.describe())

# Tidy the dataset by removing duplicates and filling in blanks
print(df.isnull().sum())
# Drop exact duplicate rows first so repeated records do not skew the column
# means used for imputation below (or any later per-river averages)
df = df.drop_duplicates().reset_index(drop=True)
# Only numeric columns can be mean-imputed; other columns are left untouched
numeric_columns = df.select_dtypes(include=[np.number]).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].mean())

# Convert the Date column to datetime so it can be used as a time axis
df['Date'] = pd.to_datetime(df['Date'])

In [None]:
# Average toxin levels per river system

# Mean Lead, Mercury and Arsenic for each River_System, with River_System
# turned back into an ordinary column by reset_index()
# NOTE(review): an earlier comment mentioned Dissolved Oxygen, but only these
# three columns are aggregated here - confirm whether it should be included
avg_toxins = (
    df.groupby('River_System')[['Lead', 'Mercury', 'Arsenic']]
    .mean()
    .reset_index()
)

# Show the full table of per-river averages
print(f"Average toxin levels:\n{avg_toxins}")

In [None]:
# Rank river systems by their mean Lead level, highest first
lead_pollution = (
    df.groupby('River_System')['Lead']
    .mean()
    .sort_values(ascending=False)
)
# head() limits the printout to the top of the ranking
print(f"Top polluted rivers by Lead: \n{lead_pollution.head()}")

# Bar chart of the mean Lead level for every river system, in ranked order
plt.figure(figsize=(10, 6))
lead_pollution.plot.bar(color='skyblue')
plt.xlabel('River System')
plt.ylabel('Toxin Level')
plt.title('Average Lead levels by River System')
plt.show()

In [None]:
# Line graph of Lead levels over time, one line per river system
plt.figure(figsize=(10, 6))
# errorbar=None switches off the error band seaborn would otherwise draw
# around each line when several readings share the same Date
sns.lineplot(x='Date', y='Lead', data=df, hue='River_System', errorbar=None)
plt.title('Lead Levels Over Time')
plt.xlabel('Date')
plt.ylabel('Lead Level')
plt.show()


In [None]:
# Rank river systems by their mean Arsenic level, highest first
arsenic_pollution = (
    df.groupby('River_System')['Arsenic']
    .mean()
    .sort_values(ascending=False)
)
# head() limits the printout to the top of the ranking
print(f"Top polluted rivers by Arsenic: \n{arsenic_pollution.head()}")

# Bar chart of the mean Arsenic level for every river system, in ranked order
plt.figure(figsize=(10, 6))
arsenic_pollution.plot.bar(color='skyblue')
plt.xlabel('River System')
plt.ylabel('Toxin Level')
plt.title('Average Arsenic levels by River System')
plt.show()

In [None]:
# Line graph of Arsenic levels over time, one line per river system
plt.figure(figsize=(10, 6))
# errorbar=None switches off the error band seaborn would otherwise draw
# around each line when several readings share the same Date
sns.lineplot(x='Date', y='Arsenic', data=df, hue='River_System', errorbar=None)
plt.title('Arsenic Levels Over Time')
plt.xlabel('Date')
plt.ylabel('Arsenic Level')
plt.show()

In [None]:
# Rank river systems by their mean Mercury level, highest first
mercury_pollution = (
    df.groupby('River_System')['Mercury']
    .mean()
    .sort_values(ascending=False)
)
# head() limits the printout to the top of the ranking
print(f"Top polluted rivers by Mercury: \n{mercury_pollution.head()}")

# Bar chart of the mean Mercury level for every river system, in ranked order
plt.figure(figsize=(10, 6))
mercury_pollution.plot.bar(color='skyblue')
plt.xlabel('River System')
plt.ylabel('Toxin Level')
plt.title('Average Mercury levels by River System')
plt.show()

In [None]:
# Line graph of Mercury levels over time, one line per river system
plt.figure(figsize=(10, 6))
# errorbar=None switches off the error band seaborn would otherwise draw
# around each line when several readings share the same Date
sns.lineplot(x='Date', y='Mercury', data=df, hue='River_System', errorbar=None)
plt.title('Mercury Levels Over Time')
plt.xlabel('Date')
plt.ylabel('Mercury Level')
plt.show()

In [None]:
# Pairwise correlations, restricted to the numeric columns
corr_matrix = df.loc[:, numeric_columns].corr()

# Annotated heatmap of the correlation matrix; each cell shows its coefficient
plt.figure(figsize=(10, 6))
sns.heatmap(data=corr_matrix, cmap='coolwarm', annot=True)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Independent two-sample t-tests comparing the Amazon and Yangtze river
# systems, one toxin at a time (ttest_ind is called with SciPy's defaults)

# Lead
amazon = df.loc[df['River_System'].eq('Amazon'), 'Lead']
yangtze = df.loc[df['River_System'].eq('Yangtze'), 'Lead']
t_stat, p_value = stats.ttest_ind(amazon, yangtze)
print(f"T test between Amazon and Yangtze for Lead: T statistic = {t_stat}, P-value = {p_value}")

# Mercury
amazon_mercury = df.loc[df['River_System'].eq('Amazon'), 'Mercury']
yangtze_mercury = df.loc[df['River_System'].eq('Yangtze'), 'Mercury']
t_stat2, p_value2 = stats.ttest_ind(amazon_mercury, yangtze_mercury)
print(f"T test between Amazon and Yangtze for Mercury: T statistic = {t_stat2}, P-value = {p_value2}")

# Arsenic
amazon_arsenic = df.loc[df['River_System'].eq('Amazon'), 'Arsenic']
yangtze_arsenic = df.loc[df['River_System'].eq('Yangtze'), 'Arsenic']
t_stat3, p_value3 = stats.ttest_ind(amazon_arsenic, yangtze_arsenic)
print(f"T test between Amazon and Yangtze for Arsenic: T statistic = {t_stat3}, P-value = {p_value3}")


In [None]:
# Linear regression between toxin levels and pH values: pH_Level is the single
# predictor, and a separate model is fitted for each toxin. `model` is rebound
# each time, so after this cell it holds the Mercury fit.

# Lead ~ pH_Level
x = df.loc[:, ['pH_Level']]  # 2-D feature frame, as scikit-learn expects
y = df.loc[:, 'Lead']
model = LinearRegression().fit(x, y)
print(f"Linear Regression Coefficients for Lead: {model.coef_}")
print(f"Linear Regression Intercept for Lead: {model.intercept_}")

# Arsenic ~ pH_Level
X = df.loc[:, ['pH_Level']]
Y = df.loc[:, 'Arsenic']
model = LinearRegression().fit(X, Y)
print(f"Linear Regression Coefficients for Arsenic: {model.coef_}")
print(f"Linear Regression Intercept for Arsenic: {model.intercept_}")

# Mercury ~ pH_Level
a = df.loc[:, ['pH_Level']]
b = df.loc[:, 'Mercury']
model = LinearRegression().fit(a, b)
print(f"Linear Regression Coefficients for Mercury: {model.coef_}")
print(f"Linear Regression Intercept for Mercury: {model.intercept_}")