In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [3]:
# Load the dataset
data = pd.read_excel('investments.xlsx')
data = pd.DataFrame(data)
data

Unnamed: 0,investment,minimum_investment,risk,best_investment,liquidity,potential_return
0,Real Estate,1000000,High,1,Low,High
1,Gold,5000,Medium,1,Medium,Medium
2,Cryptocurrency,1000,High,1,Low,High
3,Bonds,1000,Low,1,High,Low
4,Mutual Funds,1000,Medium,1,High,Medium
5,Starting a Business,50000,High,0,Medium,High
6,Education,2000,Low,1,Low,Medium
7,Retirement,100000,Medium,1,Medium,High
8,Buying a Car For Rent,400000,Medium,0,Medium,Low
9,Collectibles,5000,High,1,Low,High


In [4]:
# Split features and target variable
X = data.drop(columns=["investment","best_investment"])
y = data["investment"]

In [120]:
X = X.values
y = y.values

In [5]:
# Preprocessing
numeric_features = ["minimum_investment"]
categorical_features = ["risk", "liquidity", "potential_return"]

numeric_transformer = Pipeline(steps=[("scaler", StandardScaler())])
categorical_transformer = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ])

# Fit and transform the data using the preprocessor
X_preprocessed = preprocessor.fit_transform(data.drop(columns=["investment","best_investment"]))
print(X_preprocessed)

# Convert the preprocessed data into a DataFrame for better visualization
preprocessed_df = pd.DataFrame(X_preprocessed, columns=numeric_features + \
                                               list(preprocessor.named_transformers_['cat'] \
                                                    .named_steps['onehot'] \
                                                    .get_feature_names_out(categorical_features)))

# Display the preprocessed data
print(preprocessed_df.head())

[[ 0.87558386  1.          0.          0.          0.          1.
   0.          1.          0.          0.        ]
 [-0.28940481  0.          0.          1.          0.          0.
   1.          0.          0.          1.        ]
 [-0.29408818  1.          0.          0.          0.          1.
   0.          1.          0.          0.        ]
 [-0.29408818  0.          1.          0.          1.          0.
   0.          0.          1.          0.        ]
 [-0.29408818  0.          0.          1.          1.          0.
   0.          0.          0.          1.        ]
 [-0.23671688  1.          0.          0.          0.          0.
   1.          1.          0.          0.        ]
 [-0.29291734  0.          1.          0.          0.          1.
   0.          0.          0.          1.        ]
 [-0.17817474  0.          0.          1.          0.          0.
   1.          1.          0.          0.        ]
 [ 0.17307813  0.          0.          1.          0.          0

In [14]:
models = [
    ("Random Forest", RandomForestClassifier()),
    ("Support Vector Machine", SVC()),
    ("Logistic Regression", LogisticRegression())
]
import pickle

# Define the directory path and filename to save the model
model_filename = '../random_forest_model.pkl'
model_path = model_filename

for model_name, model in models:
    print("Training and testing", model_name)

    # Define the model
    model_pipeline = Pipeline(steps=[("preprocessor", preprocessor),
                                      ("classifier", model)])

    # Train the model on the entire dataset
    model_pipeline.fit(X, y)
    print(X)
    # Test the model on the same dataset
    y_pred = model_pipeline.predict(X)

    # Evaluate the model
    accuracy = accuracy_score(y, y_pred)
    print("Accuracy:", accuracy)
    print()

    # Save the Random Forest model
    if model_name == "Random Forest":
        with open(model_path, 'wb') as file:
            pickle.dump(model_pipeline, file)
        print("Random Forest model saved successfully.")

Training and testing Random Forest
    minimum_investment    risk liquidity potential_return
0              1000000    High       Low             High
1                 5000  Medium    Medium           Medium
2                 1000    High       Low             High
3                 1000     Low      High              Low
4                 1000  Medium      High           Medium
5                50000    High    Medium             High
6                 2000     Low       Low           Medium
7               100000  Medium    Medium             High
8               400000  Medium    Medium              Low
9                 5000    High       Low             High
10               10000    High       Low             High
11                5000     Low      High           Medium
12                1000    High       Low             High
13                5000    High    Medium             High
14              100000    High    Medium             High
15               10000     Low      H

In [15]:
# Example custom input data
custom_input = {
    'minimum_investment': [5000,1000000],  # Example numerical feature
    'risk': ['low','low'],                # Example categorical feature
    'liquidity': ['high','low'],          # Example categorical feature
    'potential_return': ['medium','high']  # Example categorical feature
}
custom_input = pd.DataFrame(custom_input)
custom_input

Unnamed: 0,minimum_investment,risk,liquidity,potential_return
0,5000,low,high,medium
1,1000000,low,low,high


In [16]:
# Load the Random Forest model from file
model_path = '../random_forest_model.pkl'
with open(model_path, 'rb') as file:
    random_forest_model = pickle.load(file)

array(['Collectibles', 'Real Estate'], dtype=object)

Model prediction: ['Savings Accounts']
