**Decision Tree -- Theory**

**Random Forest -- Theory**

**Implementation**

In [None]:
# Import dependencies
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, recall_score, f1_score, mean_squared_error, r2_score

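The next step is loading the data. We will use two datasets for this project: a breast-cancer dataset for the classification models and a solar-radiation dataset for the regression models.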

In [None]:
# Breast-cancer data: discard the 'ID' column, then split off the
# 'diagnosis' column as the classification target.
cancer_df = pd.read_csv('model_dev_data/breast_cancer.csv').drop(columns='ID')
cancer_y = cancer_df.pop('diagnosis')

# Solar data: split off the regression target first, then discard the first
# of the remaining columns.
solar_df = pd.read_csv('model_dev_data/solar.csv')
solar_y = solar_df.pop('SOLARRADIATION_0003')
solar_df = solar_df.iloc[:, 1:]

def scale(df):
    """Rescale every column of ``df`` with a default ``MinMaxScaler``.

    The scaler is fitted on exactly the values passed in. The result is a new
    DataFrame built from the bare scaled array, so it carries default integer
    row and column labels rather than the labels of ``df``.
    """
    normalized = MinMaxScaler().fit_transform(df.values)
    return pd.DataFrame(normalized)

In [None]:
def makeModel(num_features, model, df, y, clf, pos_label='M', random_state=None):
    """Fit ``model`` on the highest-variance features of ``df`` and print test metrics.

    The ``num_features`` columns with the largest variance are kept, passed
    through ``scale``, and split 80/20 into training and test sets. ``model``
    is fitted on the training part and its predictions on the held-out part
    are scored and printed.

    Parameters
    ----------
    num_features : int
        Number of highest-variance columns to keep (at least 1). If it exceeds
        the number of available columns, all of them are kept.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    df : pandas.DataFrame
        Feature table. The caller's frame is not modified.
    y : array-like
        Target values, one per row of ``df``.
    clf : bool
        True prints accuracy, recall and F1 (classification); False prints
        mean squared error and R2 (regression).
    pos_label : optional
        Label treated as the positive class for recall and F1. Defaults to
        ``'M'``, the value that used to be hard-coded.
    random_state : optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        split different on every call; pass an int for a repeatable split.

    Raises
    ------
    ValueError
        If ``num_features`` is smaller than 1.
    """
    if num_features < 1:
        raise ValueError("num_features must be at least 1, got " + str(num_features))

    # Rank columns by variance of the raw (unscaled) values and keep exactly
    # ``num_features`` of them. The previous slice ``[(-num_features - 1):]``
    # kept one column more than requested. Non-numeric columns are ignored.
    variances = df.var(numeric_only=True)
    top_cols = variances.nlargest(num_features).index
    # ``intersection`` keeps the selected columns in their original order.
    df = df[df.columns.intersection(top_cols)]

    # NOTE(review): scaling happens before the split, so whatever ``scale``
    # fits also sees the rows that end up in the test set.
    df = scale(df)
    training_x, testing_x, training_y, testing_y = train_test_split(
        df, y, test_size=0.2, random_state=random_state
    )
    model.fit(training_x, training_y)
    pred = model.predict(testing_x)

    print(str(model) + " with " + str(num_features) + " features: ")
    if clf:
        accuracy = accuracy_score(testing_y, pred)
        recall = recall_score(testing_y, pred, pos_label=pos_label)
        f1 = f1_score(testing_y, pred, pos_label=pos_label)
        print("\tAccuracy of " + str(accuracy))
        print("\tRecall of " + str(recall))
        print("\tF1 of " + str(f1))
    else:
        mse = mean_squared_error(testing_y, pred)
        r2 = r2_score(testing_y, pred)
        print("\tMean Squared Error of " + str(mse))
        print("\tR2 Score of " + str(r2))

In [None]:
# Estimators compared on each task, all created with default settings.
classifiers = [DecisionTreeClassifier(), LogisticRegression(), RandomForestClassifier()]
regressors = [DecisionTreeRegressor(), LinearRegression(), RandomForestRegressor()]

# For every feature budget, evaluate the classifiers on the cancer data first,
# then the regressors on the solar data. The same estimator objects are
# re-fitted for each budget.
for ftrs in (5, 10, 15):
    for clf in classifiers:
        makeModel(num_features=ftrs, model=clf, df=cancer_df, y=cancer_y, clf=True)
    for reg in regressors:
        makeModel(num_features=ftrs, model=reg, df=solar_df, y=solar_y, clf=False)