In [3]:
#os.chdir("/Users/christianhenry/Dropbox/PostDoc/Data Incubator/Capstone/Coffee Quality/InputData")

In [8]:
#combinations of two
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import joblib

# List of file names
file_names = [
    "Aroma_Acidity_1.txt",
    "Aroma_Body_2.txt",
    "Aroma_Balance_3.txt",
    "Acidity_Body_4.txt",
    "Acidity_Balance_5.txt",
    "Body_Balance_6.txt"
]

# Initialize a list to store numeric features for each file
numeric_features_list = []

# Loop through the file names
for file_name in file_names:
    # Extract feature names from the file name
    features = file_name.split("_")[:2]
    numeric_features_list.append(features)

    # Read in the data from the file
    file_path = os.path.join(".", file_name)
    df = pd.read_table(file_path)

    # Separate features and target
    X = df.drop(columns=['Cupper.Points'])
    y = df['Cupper.Points']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Remove ".txt" extension from file name to use as prefix
    prefix = os.path.splitext(file_name)[0]
    
    # Save training and test data as individual CSV files
    X_train.to_csv(f"{prefix}_X_train.csv", index=False)
    X_test.to_csv(f"{prefix}_X_test.csv", index=False)
    y_train.to_csv(f"{prefix}_y_train.csv", index=False)
    y_test.to_csv(f"{prefix}_y_test.csv", index=False)

    # Define preprocessing steps for numerical and categorical features
    categorical_features = ['CountryOfOrigin']

    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine the preprocessing steps for both types of features
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Create the regression model
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = model.predict(X_test)
    
    # Save the fitted model as a pickle file
    model_pickle_filename = f"model_{file_name.split('.')[0]}.pkl"
    joblib.dump(model, model_pickle_filename)

    # Calculate the mean squared error to evaluate the model's performance
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error for {file_name}: {mse}")

Mean Squared Error for Aroma_Acidity_1.txt: 0.1330493619689108
Numeric Features for Aroma_Acidity_1.txt: ['Aroma', 'Acidity']
Mean Squared Error for Aroma_Body_2.txt: 0.14204047950661589
Numeric Features for Aroma_Body_2.txt: ['Aroma', 'Body']
Mean Squared Error for Aroma_Balance_3.txt: 0.12466474787169175
Numeric Features for Aroma_Balance_3.txt: ['Aroma', 'Balance']
Mean Squared Error for Acidity_Body_4.txt: 0.12837619024242403
Numeric Features for Acidity_Body_4.txt: ['Acidity', 'Body']
Mean Squared Error for Acidity_Balance_5.txt: 0.1192476942496461
Numeric Features for Acidity_Balance_5.txt: ['Acidity', 'Balance']
Mean Squared Error for Body_Balance_6.txt: 0.12211410308078723
Numeric Features for Body_Balance_6.txt: ['Body', 'Balance']


In [13]:
from sklearn.metrics import mean_absolute_error, r2_score
data = {
    'Balance': 7.6,
    'Acidity': 8,
    'Body': 7.3,
    'CountryOfOrigin': ['Country1']
}

# Create a DataFrame from the dictionary
df = pd.DataFrame(data)
model.predict(df)

mae = mean_absolute_error(y, y_pred)
print(f"Mean Absolute Error: {mae}")

# Calculate R-squared (Coefficient of Determination)
r_squared = r2_score(y, y_pred)
print(f"R-squared: {r_squared}")

ValueError: Found input variables with inconsistent numbers of samples: [1302, 261]

In [11]:
y

0       7.33
1       7.50
2       8.00
3       8.00
4       8.00
        ... 
1297    7.42
1298    8.00
1299    7.25
1300    7.25
1301    7.25
Name: Cupper.Points, Length: 1302, dtype: float64

In [9]:
#combinations of three
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import joblib

# List of file names
file_names = [
    "Aroma_Acidity_Body_1.txt",
    "Aroma_Acidity_Balance_2.txt",
    "Aroma_Body_Balance_3.txt",
    "Acidity_Body_Balance_4.txt"
]

# Loop through the file names
for file_name in file_names:
    # Extract feature names from the file name
    features = file_name.split("_")[:3]

    # Read in the data from the file
    file_path = os.path.join(".", file_name)
    df = pd.read_table(file_path)

    # Separate features and target
    X = df.drop(columns=['Cupper.Points'])
    y = df['Cupper.Points']

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Remove ".txt" extension from file name to use as prefix
    prefix = os.path.splitext(file_name)[0]
    
    # Save training and test data as individual CSV files
    X_train.to_csv(f"{prefix}_X_train.csv", index=False)
    X_test.to_csv(f"{prefix}_X_test.csv", index=False)
    y_train.to_csv(f"{prefix}_y_train.csv", index=False)
    y_test.to_csv(f"{prefix}_y_test.csv", index=False)
    
    # Define preprocessing steps for numerical and categorical features
    categorical_features = ['CountryOfOrigin']

    numeric_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    # Combine the preprocessing steps for both types of features
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # Create the regression model
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])

    # Fit the model on the training data
    model.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = model.predict(X_test)
    
        # Save the fitted model as a pickle file
    model_pickle_filename = f"model_{file_name.split('.')[0]}.pkl"
    joblib.dump(model, model_pickle_filename)

    # Calculate the mean squared error to evaluate the model's performance
    mse = mean_squared_error(y_test, y_pred)
    print(f"Mean Squared Error for {file_name}: {mse}")

Mean Squared Error for Aroma_Acidity_Body_1.txt: 0.12888456241502974
Mean Squared Error for Aroma_Acidity_Balance_2.txt: 0.11923868808772951
Mean Squared Error for Aroma_Body_Balance_3.txt: 0.12183413463733961
Mean Squared Error for Acidity_Body_Balance_4.txt: 0.11637141314538521
