# Urban Air Pollution Prediction

In [None]:
# Dataframe and plotting
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Machine learning
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Maths
from statistics import mean

# extra
import os


# Read the three competition CSV files into pandas DataFrames.
train = pd.read_csv('Train.csv')
test = pd.read_csv("Test.csv")
ss = pd.read_csv("SampleSubmission.csv")

# Report (rows, columns) for each dataset that was just loaded.
for _label, _frame in (
    ('Train datashape: ', train),
    ('Test Data Shape: ', test),
    ('Submission Data Shape: ', ss),
):
    print(_label, _frame.shape)


def wrangle(filepath):
    """Load a CSV file and apply the shared cleaning steps.

    Steps, in order:

    1. Read ``filepath`` with ``pd.read_csv``.
    2. Drop every column in which fewer than half of the rows are non-NaN.
    3. Parse the ``Date`` column and replace it with its ``int64`` form.
    4. Only when the file's base name is ``Train.csv``: keep the rows whose
       ``target`` is below 500 and drop the ``target_*`` summary columns.

    Args:
        filepath: Path of the CSV file to load. Only its base name decides
            whether the training-only steps run, so ``data/Train.csv`` is
            treated the same as ``Train.csv``.

    Returns:
        The cleaned ``pandas.DataFrame``.

    Raises:
        Exception: Whatever ``pd.read_csv`` raised (for example
            ``FileNotFoundError``); it is re-raised after the error message
            has been printed.
        KeyError: If ``Date`` is missing, or, for the training file, if
            ``target`` or one of the leaky columns is missing.
    """
    print("\n==== WRANGLE WRANGLING ====")
    try:
        df = pd.read_csv(filepath)
    except Exception:
        # Surface the real cause. Falling through to ``return df`` here would
        # only produce an unrelated UnboundLocalError.
        print("Error importing file")
        raise

    filename = os.path.basename(filepath)
    print("Dealing with file: ", filename)

    # Drop columns that are more than half NaN values. ``thresh`` is the
    # minimum number of non-NaN entries a column needs to be kept; rounding
    # half the row count up keeps it an integer without changing which
    # columns qualify.
    print('Before drop NaN:', df.shape)
    min_non_nan = (len(df.index) + 1) // 2
    df = df.dropna(thresh=min_non_nan, axis=1)
    print('After dropping NaN:', df.shape)

    # Convert the date column to integer timestamps (the raw int64 value of
    # the parsed datetimes; nanoseconds since the epoch for datetime64[ns]).
    df['Date'] = pd.to_datetime(df['Date']).astype('int64')

    # Compare the base name, not the full path, so the training-only cleaning
    # also runs when the file is referenced through a directory.
    if filename == 'Train.csv':
        # Remove outliers with readings of 500 or above.
        df = df[df["target"] < 500]

        # Drop leaky columns: they summarise the target itself. A non-inplace
        # drop avoids mutating the filtered selection produced just above.
        df = df.drop(
            columns=['target_min', 'target_max', 'target_variance', 'target_count']
        )
        print("Dropped Leaky columns", df.shape)
        print("Done removing outliers and leaky columns")

    print("==== WRANGLE WRANGLED ====\n")
    return df

# Clean both datasets with the shared wrangle() routine.
train = wrangle('Train.csv')
test = wrangle('Test.csv')

# Splitting: separate the label from the feature columns. The identifier and
# date columns are removed together with the label.
target = "target"
_non_feature_columns = [target, 'Place_ID X Date', 'Date', 'Place_ID']
processed_train = train.drop(columns=_non_feature_columns)

# Hold out 10% of the training rows for validation (fixed seed).
X_train, X_val, y_train, y_val = train_test_split(
    processed_train,
    train[target],
    test_size=0.1,
    random_state=42,
)

# Baseline: predict the training-set mean for every row and measure the MAE
# of that constant prediction.
# The vectorized Series.mean() replaces the pure-Python statistics.mean(), so
# the value printed below is the same one the baseline is built from.
y_mean = y_train.mean()

y_pred_baseline = [y_mean] * len(y_train)
mae_baseline = mean_absolute_error(y_train, y_pred_baseline)

print("Mean P2 Reading:", round(y_mean, 6))
print("Baseline MAE:", round(mae_baseline, 6))

# Imputation
# Replace missing feature values with the column mean.
imputer = SimpleImputer(strategy='mean')

# Learn the column means from the training split only, then reuse those same
# statistics for the validation and test data. Calling fit_transform() on
# each set would refit the imputer and replace the training means with each
# set's own.
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)

# wrangle() drops sparse columns separately for each file, so the test frame
# is not guaranteed to hold exactly the training feature columns. Reindexing
# puts it in the training column set and order; a column missing from the
# test file becomes all-NaN and is then filled by the imputer.
test_features = test.drop(columns=['Place_ID X Date', 'Date', 'Place_ID'])
test_features = test_features.reindex(columns=processed_train.columns)
test = imputer.transform(test_features)

# Fit an ordinary least-squares model on the imputed training features.
model = LinearRegression()
model.fit(X_train, y_train)

# Score the fitted model on the training split and on the held-out split.
# NOTE: the value labelled "Test MAE" is computed on the validation split
# (X_val / y_val).
_train_predictions = model.predict(X_train)
_val_predictions = model.predict(X_val)
training_mae = mean_absolute_error(y_train, _train_predictions)
test_mae = mean_absolute_error(y_val, _val_predictions)

for _label, _score in (("Training MAE:", training_mae), ("Test MAE:", test_mae)):
    print(_label, round(_score, 6))

# Make predictions for the prepared test features.
test_predictions = model.predict(test)

# Build the submission table: the identifiers come from the sample submission
# file, and the predictions are attached as the "target" column.
# NOTE(review): this pairs predictions with IDs by position, so it assumes the
# sample submission rows are in the same order as the test rows; confirm.
submission = ss[["Place_ID X Date"]].assign(target=test_predictions)

print('The submsission Sample: \n', submission.head())
submission.info()

