In [55]:
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris, load_diabetes
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, mean_squared_error

In [8]:
diabetes = load_diabetes()
df2 = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
df2['class'] = diabetes.target

In [78]:
X = df2.drop('class', axis=1)
y = df2['class']
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.3)
rfc = RandomForestClassifier(max_depth=5, random_state=42).fit(X_train, y_train)
grad = GradientBoostingClassifier().fit(X_train, y_train)

reg = RandomForestRegressor().fit(X_train, y_train)

rfc_pred = rfc.predict(X_test)
reg_pred = reg.predict(X_test)
grad_pred = grad.predict(X_test)

rfc_score = accuracy_score(y_test, rfc_pred)
grad_score = accuracy_score(y_test, grad_pred)
reg_score = mean_squared_error(y_test, reg_pred)

# print('Classifier Accuracy:', clf_score)
# print('Mean Squared Error:', reg_score)

In [86]:
test = load_diabetes()
test_X = test.data
test_y = test.target

In [87]:
test_X

array([[ 0.03807591,  0.05068012,  0.06169621, ..., -0.00259226,
         0.01990749, -0.01764613],
       [-0.00188202, -0.04464164, -0.05147406, ..., -0.03949338,
        -0.06833155, -0.09220405],
       [ 0.08529891,  0.05068012,  0.04445121, ..., -0.00259226,
         0.00286131, -0.02593034],
       ...,
       [ 0.04170844,  0.05068012, -0.01590626, ..., -0.01107952,
        -0.04688253,  0.01549073],
       [-0.04547248, -0.04464164,  0.03906215, ...,  0.02655962,
         0.04452873, -0.02593034],
       [-0.04547248, -0.04464164, -0.0730303 , ..., -0.03949338,
        -0.00422151,  0.00306441]])

In [88]:
test_X = scaler.fit_transform(test_X)
test_X

array([[ 0.80050009,  1.06548848,  1.29708846, ..., -0.05449919,
         0.41853093, -0.37098854],
       [-0.03956713, -0.93853666, -1.08218016, ..., -0.83030083,
        -1.43658851, -1.93847913],
       [ 1.79330681,  1.06548848,  0.93453324, ..., -0.05449919,
         0.06015558, -0.54515416],
       ...,
       [ 0.87686984,  1.06548848, -0.33441002, ..., -0.23293356,
        -0.98564884,  0.32567395],
       [-0.9560041 , -0.93853666,  0.82123474, ...,  0.55838411,
         0.93616291, -0.54515416],
       [-0.9560041 , -0.93853666, -1.53537419, ..., -0.83030083,
        -0.08875225,  0.06442552]])

In [92]:
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from sklearn.metrics import jaccard_score
# jaccard_sim = jaccard_score(X, test_X)
similarities = cosine_similarity(X, test_X)
distances = euclidean_distances(X, test_X)
# print("Jaccard Similarity:", jaccard_sim)
print("Cosine Similarities:\n", similarities)

Cosine Similarities:
 [[ 1.         -0.41296315  0.88156583 ...  0.48051414 -0.0242446
  -0.80553914]
 [-0.41296315  1.         -0.1734083  ... -0.0658949  -0.25859409
   0.55812182]
 [ 0.88156583 -0.1734083   1.         ...  0.61244168 -0.30569363
  -0.63106326]
 ...
 [ 0.48051414 -0.0658949   0.61244168 ...  1.         -0.76887196
  -0.50031142]
 [-0.0242446  -0.25859409 -0.30569363 ... -0.76887196  1.
  -0.14226865]
 [-0.80553914  0.55812182 -0.63106326 ... -0.50031142 -0.14226865
   1.        ]]


In [97]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR

# Load the Diabetes dataset
diabetes = load_diabetes()
X_diabetes, y_diabetes = diabetes.data, diabetes.target

# List of models to try
models = {
    'LinearRegression': LinearRegression(),
    'RandomForestRegressor': RandomForestRegressor(),
    'SVR': SVR()
}

# Evaluate models using cross-validation
for model_name, model in models.items():
    scores = cross_val_score(model, X_diabetes, y_diabetes, cv=5)
    print(f'{model_name} - Mean R-squared: {scores.mean():.3f}')

# Recommend the model with the highest mean R-squared
best_model_diabetes = max(models, key=lambda k: cross_val_score(models[k], X_diabetes, y_diabetes, cv=5).mean())
print(f'Recommended model for Diabetes dataset: {best_model_diabetes}')


LinearRegression - Mean R-squared: 0.482
RandomForestRegressor - Mean R-squared: 0.409
SVR - Mean R-squared: 0.147
Recommended model for Diabetes dataset: LinearRegression


In [79]:
print('Classifier Accuracy:', rfc_score)
print('Classifier Accuracy:', grad_score)

Classifier Accuracy: 0.007518796992481203
Classifier Accuracy: 0.0


In [77]:
max_score = {"RandomForestRegressor":rfc_score, "GradientBoostingClassifier":grad_score}
max_key = max(max_score, key=max_score.get)
max_value = max_score[max_key]
best_df = pd.DataFrame([{'data': 'iris', max_key:max_value}])
best_df

Unnamed: 0,data,RandomForestRegressor
0,iris,0.007519


In [76]:
best_df = pd.DataFrame([{'data': 'iris', max_key:max_value}])
best_df

Unnamed: 0,data,RandomForestRegressor
0,iris,0.007519


In [72]:
result_df = pd.DataFrame([{"RandomForestClassifier": rfc_score, "GradientBoostingClassifier":grad_score, max_key:max_value}], index=['iris'])
result_df 

Unnamed: 0,RandomForestClassifier,GradientBoostingClassifier,RandomForestRegressor
iris,0.007519,0.0,0.007519


In [53]:
result_df2 = pd.DataFrame([{"RandomForestRegressor": reg_score}], index=['iris'])
result_df2

Unnamed: 0,RandomForestRegressor
iris,2945.32618


In [45]:
clf.score(X_train, y_train)

0.7022653721682848

In [None]:
clf_score = accuracy_score(y_test, clf_pred)
reg_score = accuracy_score(y_test, reg_pred)

In [18]:
df = pd.DataFrame(columns=["RandomForestClassifier", "RandomForestRegressor"])
df2 = pd.DataFrame([{"RandomForestClassifier": 10, "RandomForestRegressor": 10}])
new_df = pd.concat([df, df2], ignore_index=True)
new_df

Unnamed: 0,RandomForestClassifier,RandomForestRegressor
0,10,10


In [23]:
pd.DataFrame([{1:1, 2:2}])

Unnamed: 0,1,2
0,1,2


In [31]:
diabetes = load_diabetes()
df2 = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
df2['class'] = diabetes.target

In [34]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
 10  class   442 non-null    float64
dtypes: float64(11)
memory usage: 38.1 KB


In [7]:
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['class'] = iris.target

In [3]:
data_path = '../Test_01.csv'

data_type = ''
if os.path.splitext(data_path)[-1] == '.csv':
    data_type = 'Structured Data'

In [None]:
if data_type == 'Structured Data':
    data = pd.read_csv(data_path)

In [11]:
df['class'].value_counts()

0    50
1    50
2    50
Name: class, dtype: int64

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   sepal length (cm)  150 non-null    float64
 1   sepal width (cm)   150 non-null    float64
 2   petal length (cm)  150 non-null    float64
 3   petal width (cm)   150 non-null    float64
 4   class              150 non-null    int32  
dtypes: float64(4), int32(1)
memory usage: 5.4 KB


In [50]:

target_label = 'class'
categorical_features = df.select_dtypes(include=['object']) 
if len(df[target_label].unique()) > 2 and categorical_features.empty == True:
    df_excluded = df.drop(columns=target_label)
    numeric_features = df_excluded.select_dtypes(include=['float64', 'int64', 'int32'])
    scaler = StandardScaler()
    numeric_features = scaler.fit_transform(numeric_features)

In [None]:
label_num = len(df[target_label].unique())
# categorical_features = np.ravel(categorical_features) # select_dtypes를 출력하면 DataFrame로 나오기 때문에, np.ravel을 사용하여 np.array 형태로 변환 

if categorical_features.empty == False:
    for col in categorical_features.columns:
        if categorical_features[col].str.match(r'\d{4}-\d{2}-\d{2}').all() == True or categorical_features[col].str.match(r'\d{4}:\d{2}:\d{2}').all() == True:
            categorical_features[col] = pd.to_datetime(categorical_features[col])
            categorical_features['year'] = categorical_features[col].dt.year
            categorical_features['month'] = categorical_features[col].dt.month
            categorical_features['day'] = categorical_features[col].dt.day
            categorical_features['hour'] = categorical_features[col].dt.hour
            categorical_features['minute'] = categorical_features[col].dt.minute
        else:
            encoder = LabelEncoder()
            categorical_features[col] = encoder.fit_transform(categorical_features[col])

    datetime_columns = categorical_features.select_dtypes('datetime64')
    # datetime64 컬럼을 삭제
    categorical_features = categorical_features.drop(columns=datetime_columns.columns)
    # final_df = pd.concat([pd.DataFrame(numeric_features), pd.DataFrame(categorical_features)], axis=1)
    # print(final_df)
    print(categorical_features)
    X_train, X_test, y_train, y_test = train_test_split(numeric_features, categorical_features, test_size=0.3, shuffle=True, random_state=42)