In [1]:
import pandas as pd

In [6]:
# Load the full machine-learning table from the CSV in the working directory.
mldf = pd.read_csv(filepath_or_buffer='machinelearning_data.csv')

In [4]:
# Spot-check: show the row(s) whose 'Unnamed: 0' value equals 31940.
mldf.loc[mldf['Unnamed: 0'].eq(31940)]

Unnamed: 0.1,Unnamed: 0,juglandacae,s_name,distance_ft,resistance
31940,31940,Carya illinoinensis,Quercus virginiana,11795.727385,0


In [5]:
# Write the frame back to CSV without the row index.
# NOTE(review): this targets the same path the notebook reads its data from,
# so the input file is overwritten in place -- confirm that is intended.
# NOTE(review): index_label=False looks redundant once index=False is set --
# confirm before removing.
mldf.to_csv(
    'machinelearning_data.csv',
    index=False,
    index_label=False,
)

In [34]:
# Keep only the rows whose 'juglandacae' value is not exactly 'Juglans'.
# NOTE(review): the result is named walnut_df although rows equal to 'Juglans'
# are the ones being excluded -- confirm the comparison direction is intended.
walnut_df = mldf.loc[mldf['juglandacae'].ne('Juglans')]

In [35]:
# Drop the leftover index columns that earlier CSV round-trips added
# ('Unnamed: 0' and 'Unnamed: 0.1'; both appear in the row preview above).
# errors='ignore' keeps this cell safe to re-run: walnut_df is reassigned from
# itself, so on a second run the columns are already gone and a plain drop
# would raise KeyError.
walnut_df = walnut_df.drop(
    columns=['Unnamed: 0', 'Unnamed: 0.1'],
    errors='ignore',
)

In [36]:
# Number of rows in the subset, counted via the 'juglandacae' column.
walnut_df['juglandacae'].size

58403394

In [37]:
# Feature matrix: every column except the target and the 'juglandacae' column.
X = walnut_df.drop(columns=['resistance', 'juglandacae'])
# Target vector: the 'resistance' column.
Y = walnut_df.loc[:, 'resistance']

In [38]:
from sklearn.model_selection import train_test_split

# Split into train and test parts: 20% of the rows go to the test set, the
# split is stratified on Y, and the seed is fixed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=13,
    stratify=Y,
)

In [39]:
# set up the pipeline for cross-validated logistic regression (LogisticRegressionCV)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegressionCV

In [40]:
# Preprocessing is defined per column type and then combined into one step.

# Numeric column: fill missing values with the median, then standardize.
numeric_features = ['distance_ft']
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical column: fill missing values with the constant 'missing', then
# one-hot encode (handle_unknown='ignore' so categories unseen during fit do
# not raise at transform time).
categorical_features = ['s_name']
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column list to its matching transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [41]:
# Full prediction pipeline: the preprocessing step followed by a
# LogisticRegressionCV classifier left at its default settings.
mllr = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegressionCV()),
])

In [None]:
# Fit the whole pipeline (preprocessing + classifier) on the training split.
# NOTE(review): this cell carries no execution count in the saved notebook, so
# the fit may not have completed in the recorded session -- confirm.
mllr.fit(X=X_train, y=y_train)



In [33]:
# Report the pipeline's score on the training and the held-out test split.
# NOTE(review): this cell's execution count (33) is lower than the cells that
# build the data and the model (34-41), and the fit cell has no count, so the
# recorded output may come from an earlier kernel state. Both recorded scores
# are exactly 1.0 -- worth checking whether a feature fully determines the
# target before trusting them.
train_score = mllr.score(X_train, y_train)
print(f"Training Data Score: {train_score}")
test_score = mllr.score(X_test, y_test)
print(f"Testing Data Score: {test_score}")

Training Data Score: 1.0
Testing Data Score: 1.0
