In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn import model_selection, preprocessing, feature_selection, ensemble, linear_model, metrics, decomposition
from lime import lime_tabular

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [6]:
# Load the training examples and their labels from the competition input folder.
train, labels = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/coms4771-spring-2022-regression-competition/train_examples.csv',
        '/kaggle/input/coms4771-spring-2022-regression-competition/train_labels.csv',
    )
)
# Preview the first rows of the examples.
train.head()

In [7]:
# Convert the numeric feature columns to integers.
# Each column is cast from its OWN values. Assigning `feature_2` to every
# column would turn feature_5..feature_10 into copies of feature_2 and
# discard their information.
# NOTE(review): assumes these columns are numeric with no missing values
# (astype(int) raises otherwise) -- TODO confirm against the raw CSV.
# Any other dataset fed to the model must receive the same per-column
# conversion so that its features match the training features.
int_feature_columns = [
    'feature_2',
    'feature_5',
    'feature_6',
    'feature_7',
    'feature_8',
    'feature_9',
    'feature_10',
]
train = train.astype({column: int for column in int_feature_columns})
# Show only the first rows instead of dumping the whole frame.
train.head()

In [8]:
# Parse the raw "month-day hour:minute:second" strings in `feature_0` into
# datetime objects, stored in a temporary `date` column, so that the
# individual date parts can later be extracted as integer features.
import datetime
# NOTE: this import rebinds the name `datetime` to the class, shadowing the
# module imported on the previous line.
from datetime import datetime


def _parse_timestamp(raw):
    """Return the datetime described by a '%m-%d %H:%M:%S' formatted string."""
    return datetime.strptime(raw, "%m-%d %H:%M:%S")


train['date'] = train['feature_0'].apply(_parse_timestamp)

In [9]:
# Split the parsed timestamp into one integer column per date part.
parsed_dates = pd.to_datetime(train['date'])
date_parts = (
    ('month', 'month'),
    ('day', 'day'),
    ('hour', 'hour'),
    ('minute', 'minute'),
    ('seconds', 'second'),  # column is named `seconds`, accessor is `.dt.second`
)
for column_name, accessor_name in date_parts:
    train[column_name] = getattr(parsed_dates.dt, accessor_name)

# The raw string and the intermediate datetime column are no longer needed.
train.drop(columns=['feature_0', 'date'], inplace=True)
train.head()



In [10]:
# Build the feature matrix (every column except `id`) and the target vector.
# NOTE(review): assumes the rows of `labels` are in the same order as the rows
# of `train`; nothing here aligns them by id -- TODO confirm.
is_feature_column = train.columns != 'id'
x = train.loc[:, is_feature_column].values

# The target is truncated to whole numbers before modelling.
duration_as_int = labels['duration'].astype(int)
labels['duration'] = duration_as_int
y = labels['duration']

In [11]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for evaluation; the seed is fixed so the split is reproducible.
split_parts = train_test_split(x, y, test_size=0.3, random_state=101)
x_train, x_test, y_train, y_test = split_parts


In [12]:
# Standardize the features, then project them onto two principal components.
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# The scaler is fitted on the training split only and then applied to both splits.
scaler = StandardScaler().fit(x_train)
train_scaled, test_scaled = (scaler.transform(part) for part in (x_train, x_test))

# Likewise, PCA is fitted on the scaled training split and applied to both splits.
pca = PCA(n_components=2).fit(train_scaled)
train_pca, test_pca = (pca.transform(part) for part in (train_scaled, test_scaled))

In [13]:
# Fit an ordinary least-squares model on the two PCA components.
from sklearn.linear_model import LinearRegression

lm = LinearRegression().fit(train_pca, y_train)
# Leave the fitted estimator as the cell's last expression so it is displayed.
lm

In [14]:
# Show the fitted model's intercept term.
fitted_intercept = lm.intercept_
print(fitted_intercept)

In [15]:
# Predict on the held-out split and plot true labels (x-axis) against predictions (y-axis).
predictions = lm.predict(test_pca)
fig, ax = plt.subplots()
ax.scatter(y_test, predictions);

In [16]:
# Distribution of the residuals (true label minus prediction) on the held-out split.
# `sns.distplot` is deprecated; `histplot` with a KDE overlay and density
# normalisation is the documented replacement for a histogram-plus-KDE plot.
residuals = y_test - predictions
sns.histplot(residuals, bins=50, kde=True, stat="density", kde_kws=dict(cut=3));

In [17]:
# Report MAE, MSE and RMSE of the predictions on the held-out split.
from sklearn import metrics

mae = metrics.mean_absolute_error(y_test, predictions)
mse = metrics.mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
for label, value in (('MAE:', mae), ('MSE:', mse), ('RMSE:', rmse)):
    print(label, value)

In [18]:
# Load the unlabelled competition examples that the fitted model will be applied to.
test_examples_path = '/kaggle/input/coms4771-spring-2022-regression-competition/test_examples.csv'
test_train = pd.read_csv(test_examples_path)
test_train.head()

In [19]:
# Clean the competition examples: integer casts, then date-part extraction.
# Each column is cast from its OWN values. Assigning `feature_2` to every
# column would turn feature_5..feature_10 into copies of feature_2 and
# discard their information.
# NOTE(review): assumes these columns are numeric with no missing values
# (astype(int) raises otherwise) -- TODO confirm against the raw CSV.
# The data the model was trained on must receive the same per-column
# conversion so that both sets of features match.
int_feature_columns = [
    'feature_2',
    'feature_5',
    'feature_6',
    'feature_7',
    'feature_8',
    'feature_9',
    'feature_10',
]
test_train = test_train.astype({column: int for column in int_feature_columns})

# Parse the "month-day hour:minute:second" strings once and derive one
# integer column per date part. `datetime` is the class imported earlier in
# the notebook via `from datetime import datetime`.
parsed_dates = pd.to_datetime(
    test_train['feature_0'].apply(lambda x: datetime.strptime(x, "%m-%d %H:%M:%S"))
)
test_train['month'] = parsed_dates.dt.month
test_train['day'] = parsed_dates.dt.day
test_train['hour'] = parsed_dates.dt.hour
test_train['minute'] = parsed_dates.dt.minute
test_train['seconds'] = parsed_dates.dt.second

# The raw timestamp string is no longer needed once its parts are extracted.
test_train = test_train.drop(columns=['feature_0'])
test_train.head()

In [20]:
# Apply the training-time standardization and PCA to the new data, then predict.
x_testing = test_train.loc[:, test_train.columns != 'id'].values

# Reuse the scaler and PCA fitted on the training split. Fitting a fresh
# scaler on these examples would standardize them with different statistics
# than the ones the model was trained with, and would overwrite the
# training-fitted `scaler` object.
x_testing = scaler.transform(x_testing)
x_testing = pca.transform(x_testing)

predictions_testing = lm.predict(x_testing)

In [21]:
# Store the model's predictions next to the examples they belong to.
test_train = test_train.assign(duration=predictions_testing)
test_train.head()

In [22]:
# Cast the predicted duration to whole numbers (astype(int) truncates the fractional part).
test_train = test_train.astype({'duration': int})

In [27]:
# Remove every model input so that only `id` and `duration` remain for the submission.
raw_feature_columns = [
    'feature_1', 'feature_2', 'feature_3', 'feature_4', 'feature_5',
    'feature_6', 'feature_7', 'feature_8', 'feature_9', 'feature_10',
]
date_part_columns = ['month', 'day', 'hour', 'minute', 'seconds']
test_train.drop(columns=raw_feature_columns + date_part_columns, inplace=True)
test_train.head()

In [30]:
# Write the submission file (without the DataFrame index) to the working directory.
submission_path = 'test_labels.csv'
test_train.to_csv(submission_path, index=False)