# Mars weather dataset
dataset tomado de: https://data.world/the-pudding/mars-weather

# Clasificación y reconocimiento de patrones
## Universidad Nacional de Colombia
## Tarea #2

## Diego Fernando Uribe
## Daniel Padierna

In [57]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

dataset = "dataset/mars-weather.csv"

df = pd.read_csv(dataset, index_col="id")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1894 entries, 1895 to 1
Data columns (total 9 columns):
terrestrial_date    1894 non-null object
sol                 1894 non-null int64
ls                  1894 non-null int64
month               1894 non-null object
min_temp            1867 non-null float64
max_temp            1867 non-null float64
pressure            1867 non-null float64
wind_speed          0 non-null float64
atmo_opacity        1894 non-null object
dtypes: float64(4), int64(2), object(3)
memory usage: 148.0+ KB


Eliminamos las entradas con características nulas

In [2]:
del df["wind_speed"]
df.dropna(inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1867 entries, 1895 to 2
Data columns (total 8 columns):
terrestrial_date    1867 non-null object
sol                 1867 non-null int64
ls                  1867 non-null int64
month               1867 non-null object
min_temp            1867 non-null float64
max_temp            1867 non-null float64
pressure            1867 non-null float64
atmo_opacity        1867 non-null object
dtypes: float64(3), int64(2), object(3)
memory usage: 131.3+ KB


In [3]:
df.head()

Unnamed: 0_level_0,terrestrial_date,sol,ls,month,min_temp,max_temp,pressure,atmo_opacity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1895,2018-02-27,1977,135,Month 5,-77.0,-10.0,727.0,Sunny
1893,2018-02-26,1976,135,Month 5,-77.0,-10.0,728.0,Sunny
1894,2018-02-25,1975,134,Month 5,-76.0,-16.0,729.0,Sunny
1892,2018-02-24,1974,134,Month 5,-77.0,-13.0,729.0,Sunny
1889,2018-02-23,1973,133,Month 5,-78.0,-18.0,730.0,Sunny


In [4]:
print(set(df["month"]))
print(set(df["atmo_opacity"]))

{'Month 12', 'Month 9', 'Month 7', 'Month 6', 'Month 1', 'Month 10', 'Month 3', 'Month 5', 'Month 2', 'Month 11', 'Month 8', 'Month 4'}
{'Sunny'}


In [5]:
df["month"] = df["month"].apply(lambda _: int(_.split()[-1]))
df.head()

Unnamed: 0_level_0,terrestrial_date,sol,ls,month,min_temp,max_temp,pressure,atmo_opacity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1895,2018-02-27,1977,135,5,-77.0,-10.0,727.0,Sunny
1893,2018-02-26,1976,135,5,-77.0,-10.0,728.0,Sunny
1894,2018-02-25,1975,134,5,-76.0,-16.0,729.0,Sunny
1892,2018-02-24,1974,134,5,-77.0,-13.0,729.0,Sunny
1889,2018-02-23,1973,133,5,-78.0,-18.0,730.0,Sunny


In [6]:
print(len(df))
df.drop_duplicates(inplace=True)
print(len(df))

1867
1867


In [7]:
df["terrestrial_date"] = pd.to_datetime(df["terrestrial_date"])


# Extracción de características
<hr>

## Filtro

In [35]:
from sklearn.feature_selection import VarianceThreshold

In [46]:
X = df.drop(["max_temp", "terrestrial_date", "atmo_opacity"], axis=1, errors="ignore")
y = df["max_temp"]

In [47]:
np.var(df)

sol         319440.723490
ls           11145.208348
month           12.288905
min_temp        30.278866
max_temp       114.417004
pressure      2941.835974
dtype: float64

In [61]:
selector = VarianceThreshold(threshold=31)
selector.fit_transform(X)
index_f = selector.get_support()

print(list(X.columns[index_f]))

['sol', 'ls', 'pressure']


## Wrapper

In [62]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel

In [63]:
model = LinearRegression()
rfe = RFE(estimator=model, n_features_to_select=3, step=1)
rfe.fit(X, y)
index_w = np.array(rfe.ranking_) == 1

In [64]:
#Xw_new = X[X.columns[index_w]]
list(Xw_new.columns.values)

['ls', 'month', 'min_temp']

# Embedded

In [65]:
predictor = ExtraTreesClassifier()
predictor.fit(X, y)

model = SelectFromModel(predictor, threshold=0.2, prefit=True)
#Xe_new = model.transform(X)


index_e = model.get_support()
print(list(X.columns.values[index_e]))

['sol', 'ls', 'pressure']


# Evaluación

In [66]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [69]:
linear_w = LinearRegression()
linear_e = LinearRegression()

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=1)

Xw_train = X_train[X_train.columns[index_w]]
Xe_train = X_train[X_train.columns[index_e]]

Xw_test = X_test[X_test.columns[index_w]]
Xe_test = X_test[X_test.columns[index_e]]

In [70]:
linear_w.fit(Xw_train, y_train)
linear_e.fit(Xe_train, y_train)

yw_pred = linear_w.predict(Xw_test)
ye_pred = linear_e.predict(Xe_test)

print(mean_squared_error(yw_pred, y_test))
print(mean_squared_error(ye_pred, y_test))

29.082738797640015
53.887641212301325
