<a href="https://colab.research.google.com/github/h-wi/lg-aimers-hackathon/blob/main/%5BBaseline%5D_RandomForest_with_UFS_(0_52).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import

In [2]:
import pandas as pd
import random
import os
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(37) # Seed 고정

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Data Load

A. Y_Quality 무시하고 그냥 Y_Class 예측 모형 만들기

**B. Y_Quality를 예측하고 Spec 범위를 활용해서 Y_Class를 생성하기**

C. A.와 B.를 모두 활용하기



In [5]:
cd /content/drive/MyDrive/aimers

/content/drive/MyDrive/aimers


In [6]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [7]:
train_df['LINE'].unique()

array(['T050304', 'T050307', 'T100304', 'T100306', 'T010306', 'T010305'],
      dtype=object)

In [8]:
test_df['LINE'].unique()

array(['T100306', 'T100304', 'T010305', 'T010306', 'T050304', 'T050307'],
      dtype=object)

In [9]:
# 생산 LINE별로 쓰이는 변수가 다르므로 생산 LINE별로 DF 구분

train_grouped = train_df.groupby('LINE')
test_grouped = test_df.groupby('LINE')

for name, group in train_grouped:
    if name == 'T050304':
        t04_df = group.copy()
    elif name == 'T050307':
        t07_df = group.copy()
    elif name == 'T100304':
        t14_df = group.copy()
    elif name == 'T100306':
        t16_df = group.copy()
    elif name == 'T010306':
        t06_df = group.copy()  
    elif name == 'T010305':
        t05_df = group.copy()

for name, group in test_grouped:
    if name == 'T050304':
        test04_df = group.copy()
    elif name == 'T050307':
        test07_df = group.copy()
    elif name == 'T100304':
        test14_df = group.copy()
    elif name == 'T100306':
        test16_df = group.copy()
    elif name == 'T010306':
        test06_df = group.copy()  
    elif name == 'T010305':
        test05_df = group.copy()

In [10]:
# 각 df에서 값을 갖고 있지 않은 열(결측치만 갖고 있는 열) 모두 삭제
# df마다 적합할 모델 만들고 test df도 해당 모델에만 예측가능

t04_df = t04_df.dropna(axis=1, how='all')
t07_df = t07_df.dropna(axis=1, how='all')
t14_df = t14_df.dropna(axis=1, how='all')
t16_df = t16_df.dropna(axis=1, how='all')
t06_df = t06_df.dropna(axis=1, how='all')
t05_df = t05_df.dropna(axis=1, how='all')

test04_df = test04_df.dropna(axis=1, how='all')
test07_df = test07_df.dropna(axis=1, how='all')
test14_df = test14_df.dropna(axis=1, how='all')
test16_df = test16_df.dropna(axis=1, how='all')
test06_df = test06_df.dropna(axis=1, how='all')
test05_df = test05_df.dropna(axis=1, how='all')

In [11]:
# 각 LINE별로 잘 모였는지 확인, LINE별 생산하는 제품의 CODE 확인.

print(t04_df['LINE'].unique(),t04_df['PRODUCT_CODE'].unique())
print(t07_df['LINE'].unique(),t07_df['PRODUCT_CODE'].unique())
print(t14_df['LINE'].unique(),t14_df['PRODUCT_CODE'].unique())
print(t16_df['LINE'].unique(),t16_df['PRODUCT_CODE'].unique())
print(t06_df['LINE'].unique(),t06_df['PRODUCT_CODE'].unique())
print(t05_df['LINE'].unique(),t05_df['PRODUCT_CODE'].unique())

print('########################################')

print(test04_df['LINE'].unique(),test04_df['PRODUCT_CODE'].unique())
print(test07_df['LINE'].unique(),test07_df['PRODUCT_CODE'].unique())
print(test14_df['LINE'].unique(),test14_df['PRODUCT_CODE'].unique())
print(test16_df['LINE'].unique(),test16_df['PRODUCT_CODE'].unique())
print(test06_df['LINE'].unique(),test06_df['PRODUCT_CODE'].unique())
print(test05_df['LINE'].unique(),test05_df['PRODUCT_CODE'].unique())

['T050304'] ['A_31']
['T050307'] ['A_31']
['T100304'] ['T_31' 'O_31']
['T100306'] ['T_31' 'O_31']
['T010306'] ['A_31']
['T010305'] ['A_31']
########################################
['T050304'] ['A_31']
['T050307'] ['A_31']
['T100304'] ['T_31' 'O_31']
['T100306'] ['T_31' 'O_31']
['T010306'] ['A_31']
['T010305'] ['A_31']


## Data Pre-processing

In [12]:
# pandas 전체 행 출력 개수 제한
pd.options.display.max_rows = 5000

A. 각 LINE, PRODUCT_CODE별로 데이터를 분할하고 따로 모형 적합하기

**필요한 변수만 활용하기 때문에 메모리/연산 효율적
각 LINE, PRODUCT_CODE별 모형 최적화 가능**

### 1. Line **T050304(t04)** data preprocessing


#### Divide dataframe and Fill in missing values

Use it https://m.blog.naver.com/tjdrud1323/221720259834

In [13]:
train_x = t04_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
t04_train_y = t04_df['Y_Class']

t04_test_x = test04_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

In [14]:
# qualitative to quantitative
# 각 생산라인과 상품코드를 integer encoding

qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    
    for label in np.unique(t04_test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    t04_test_x[i] = le.transform(t04_test_x[i]) 
print('Done.')

Done.


In [15]:
# train 결측값 확인하기

# train_x : 전체 행 78개 중 결측치

# X_2063 ~ X_2092 39개  # X_2100 ~ X_2407 39개  # X_2543 ~ X_2627 39개

train_x.isnull().sum()

LINE             0
PRODUCT_CODE     0
X_128            0
X_129            0
X_132            0
X_133            0
X_134            0
X_135            0
X_136            0
X_137            0
X_138            0
X_139            0
X_140            0
X_141            0
X_142            0
X_143            0
X_144            0
X_145            0
X_146            0
X_147            0
X_148            0
X_149            0
X_150            0
X_151            0
X_152            0
X_153            0
X_154            0
X_155            0
X_156            0
X_157            0
X_158            0
X_159            0
X_160            0
X_161            0
X_162            0
X_163            0
X_164            0
X_165            0
X_166            0
X_167            0
X_168            0
X_169            0
X_170            0
X_171            0
X_172            0
X_173            0
X_174            0
X_175            0
X_176            0
X_177            0
X_178            0
X_179            0
X_180       

In [16]:
# 일단 결측치 39개있는 열도 지우고 성능 별로면 넣어보기 : threshold 38

train_x = train_x.dropna(axis=1,thresh=len(train_x)-38)

# Columns in test data that are not present in train data
# train data에 맞춰서 test data의 column도 정리해주기.
cols_to_drop = [col for col in t04_test_x.columns if col not in train_x.columns]

# Drop the columns
t04_test_x.drop(columns=cols_to_drop, inplace=True)

In [17]:
# 결측치 threshold 안 넘는 열 채우기
# 일단 중앙값으로 해봅시다. =>  별로면 KNN

train_median = train_x.median()

train_x = train_x.fillna(train_median)
t04_test_x = t04_test_x.fillna(train_median) # **test의 중앙값 사용하면 안됨!!(Data leackage)**

#### Debugging

In [18]:
########################### DEBUG ########################################

if (train_x.isnull().any().any() or t04_test_x.isnull().any().any()):
  print('missing value check again')

if (len(train_x.columns) != len(t04_test_x.columns)):
  print('column # check again')

#### Feature selection
Using PCA with StandardScaling

In [19]:
from sklearn.preprocessing import StandardScaler  # 표준화 패키지 라이브러리 

train_x = StandardScaler().fit_transform(train_x)

In [35]:
sum(pca.explained_variance_ratio_)

0.800346305862121

In [36]:
from sklearn.decomposition import PCA

pca = PCA(n_components=30) # 주성분을 몇개로 할지 결정
t04_train_new = pca.fit_transform(train_x)
t04_test_new = pca.transform(t04_test_x)



### 2. **Line T050307(t07)** data preprocessing


#### Divide dataframe and Fill in missing values

In [None]:
train_x = t07_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
t07_train_y = t07_df['Y_Class']

t07_test_x = test07_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

In [None]:
# qualitative to quantitative
# 각 생산라인과 상품코드를 integer encoding

qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    
    for label in np.unique(t07_test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    t07_test_x[i] = le.transform(t07_test_x[i]) 
print('Done.')

Done.


In [None]:
# train 결측값 확인하기

# train_x : X_1167 ~ X_1171 28개, X_1274 ~ X_1278 28개, 
train_x.isnull().sum()

In [None]:
# column의 반 이상이 결측치면 자른다.
train_x = train_x.dropna(axis=1,thresh=len(train_x)/2)

# Columns in test data that are not present in train data
# train data에 맞춰서 test data의 column도 정리해주기.
cols_to_drop = [col for col in t07_test_x.columns if col not in train_x.columns]

# Drop the columns
t07_test_x.drop(columns=cols_to_drop, inplace=True)

In [None]:
# 결측치 threshold 안 넘는 열 채우기
# 일단 중앙값으로 해봅시다. 별로면 KNN

train_median = train_x.median()
test_median = t07_test_x.median()

train_x = train_x.fillna(train_median)
t07_test_x = t07_test_x.fillna(test_median)

#### Debugging

In [None]:
########################### DEBUG ########################################

if (train_x.isnull().any().any() or t07_test_x.isnull().any().any()):
  print('missing value check again')

if (len(train_x.columns) != len(t07_test_x.columns)):
  print('column # check again')

#### Feature selection
일단 지금은 UFS방식, 성능 올리려면 좀 더 복잡한 거 써야된다 !!

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression

# the number of top features to select. 
SELECT_FEATURE = 100

# Apply UFS to the training data
selector = SelectKBest(f_regression, k=SELECT_FEATURE)

# Apply the same UFS to the test data
t07_train_new = selector.fit_transform(train_x, t07_train_y)
t07_test_new = selector.transform(t07_test_x)

  X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means ** 2)
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  f_statistic = corr_coef_squared / (1 - corr_coef_squared) * deg_of_freedom


### 3. **Line T100304(t14)** data preprocessing


#### Divide dataframe and Fill in missing values

In [None]:
train_x = t14_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
t14_train_y = t14_df['Y_Class']

t14_test_x = test14_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

In [None]:
# qualitative to quantitative
# 각 생산라인과 상품코드를 integer encoding

qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    
    for label in np.unique(t14_test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    t14_test_x[i] = le.transform(t14_test_x[i]) 
print('Done.')

Done.


In [None]:
# train 결측값 확인하기

train_x.isnull().sum()

LINE            0
PRODUCT_CODE    0
X_1             0
X_2             0
X_3             0
               ..
X_929           1
X_930           1
X_931           1
X_932           1
X_933           1
Length: 673, dtype: int64

In [None]:
train_x = train_x.dropna(axis=1,thresh=len(train_x)/2)

# Columns in test data that are not present in train data
# train data에 맞춰서 test data의 column도 정리해주기.
cols_to_drop = [col for col in t14_test_x.columns if col not in train_x.columns]

# Drop the columns
t14_test_x.drop(columns=cols_to_drop, inplace=True)

In [None]:
# 결측치 threshold 안 넘는 열 채우기
# 일단 중앙값으로 해봅시다. 별로면 KNN

train_median = train_x.median()

train_x = train_x.fillna(train_median)
t14_test_x = t14_test_x.fillna(train_median)

#### Debugging

In [None]:
########################### DEBUG ########################################

if (train_x.isnull().any().any() or t14_test_x.isnull().any().any()):
  print('missing value check again')

if (len(train_x.columns) != len(t14_test_x.columns)):
  print('column # check again')

#### Feature selection
일단 지금은 UFS방식, 성능 올리려면 좀 더 복잡한 거 써야된다 !!

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression

# the number of top features to select. 
SELECT_FEATURE = 100

# Apply UFS to the training data
selector = SelectKBest(f_regression, k=SELECT_FEATURE)

# Apply the same UFS to the test data
t14_train_new = selector.fit_transform(train_x, t14_train_y)
t14_test_new = selector.transform(t14_test_x)

  X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means ** 2)
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  f_statistic = corr_coef_squared / (1 - corr_coef_squared) * deg_of_freedom


### 4. Line **T100306(t16)** data preprocessing


#### Divide dataframe and Fill in missing values

In [None]:
train_x = t16_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
t16_train_y = t16_df['Y_Class']

t16_test_x = test16_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

In [None]:
# qualitative to quantitative
# 각 생산라인과 상품코드를 integer encoding

qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    
    for label in np.unique(t16_test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    t16_test_x[i] = le.transform(t16_test_x[i]) 
print('Done.')

Done.


In [None]:
# train 결측값 확인하기
train_x.isnull().sum()

LINE            0
PRODUCT_CODE    0
X_1             0
X_2             0
X_3             0
               ..
X_929           0
X_930           0
X_931           0
X_932           0
X_933           0
Length: 673, dtype: int64

In [None]:
# 일단 결측치 39개있는 열도 지우고 성능 별로면 넣어보기

train_x = train_x.dropna(axis=1,thresh=len(train_x)/2)

# Columns in test data that are not present in train data
# train data에 맞춰서 test data의 column도 정리해주기.
cols_to_drop = [col for col in t16_test_x.columns if col not in train_x.columns]

# Drop the columns
t16_test_x.drop(columns=cols_to_drop, inplace=True)

In [None]:
# 결측치 threshold 안 넘는 열 채우기
# 일단 중앙값으로 해봅시다. 별로면 KNN

train_median = train_x.median()

train_x = train_x.fillna(train_median)
t16_test_x = t16_test_x.fillna(train_median)

#### Debugging

In [None]:
########################### DEBUG ########################################

if (train_x.isnull().any().any() or t16_test_x.isnull().any().any()):
  print('missing value check again')

if (len(train_x.columns) != len(t16_test_x.columns)):
  print('column # check again')

#### Feature selection
일단 지금은 UFS방식, 성능 올리려면 좀 더 복잡한 거 써야된다 !!

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression

# the number of top features to select. 
SELECT_FEATURE = 100

# Apply UFS to the training data
selector = SelectKBest(f_regression, k=SELECT_FEATURE)

# Apply the same UFS to the test data
t16_train_new = selector.fit_transform(train_x, t16_train_y)
t16_test_new = selector.transform(t16_test_x)

  X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means ** 2)
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  f_statistic = corr_coef_squared / (1 - corr_coef_squared) * deg_of_freedom


### 5. Line **T010306(t06)** data preprocessing


#### Divide dataframe and Fill in missing values

In [None]:
train_x = t06_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
t06_train_y = t06_df['Y_Class']

t06_test_x = test06_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

In [None]:
# qualitative to quantitative
# 각 생산라인과 상품코드를 integer encoding

qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    
    for label in np.unique(t06_test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    t06_test_x[i] = le.transform(t06_test_x[i]) 
print('Done.')

Done.


In [None]:
# train 결측값 확인하기

train_x.isnull().sum()

LINE            0
PRODUCT_CODE    0
X_246           0
X_247           0
X_248           0
               ..
X_2861          0
X_2862          0
X_2863          0
X_2864          0
X_2865          0
Length: 888, dtype: int64

In [None]:
# 일단 결측치 39개있는 열도 지우고 성능 별로면 넣어보기

train_x = train_x.dropna(axis=1,thresh=len(train_x)/2)

# Columns in test data that are not present in train data
# train data에 맞춰서 test data의 column도 정리해주기.
cols_to_drop = [col for col in t06_test_x.columns if col not in train_x.columns]

# Drop the columns
t06_test_x.drop(columns=cols_to_drop, inplace=True)

In [None]:
# 결측치 threshold 안 넘는 열 채우기
# 일단 중앙값으로 해봅시다. 별로면 KNN

train_median = train_x.median()

train_x = train_x.fillna(train_median)
t06_test_x = t06_test_x.fillna(train_median)

#### Debugging

In [None]:
########################### DEBUG ########################################

if (train_x.isnull().any().any() or t06_test_x.isnull().any().any()):
  print('missing value check again')

if (len(train_x.columns) != len(t06_test_x.columns)):
  print('column # check again')

#### Feature selection
일단 지금은 UFS방식, 성능 올리려면 좀 더 복잡한 거 써야된다 !!

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression

# the number of top features to select. 
SELECT_FEATURE = 100

# Apply UFS to the training data
selector = SelectKBest(f_regression, k=SELECT_FEATURE)

# Apply the same UFS to the test data
t06_train_new = selector.fit_transform(train_x, t06_train_y)
t06_test_new = selector.transform(t06_test_x)

  X_norms = np.sqrt(row_norms(X.T, squared=True) - n_samples * X_means ** 2)
  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  f_statistic = corr_coef_squared / (1 - corr_coef_squared) * deg_of_freedom


### 6. Line **T010305(t05)** data preprocessing


#### Divide dataframe and Fill in missing values

In [None]:
train_x = t05_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
t05_train_y = t05_df['Y_Class']

t05_test_x = test05_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

In [None]:
# qualitative to quantitative
# 각 생산라인과 상품코드를 integer encoding

qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    
    for label in np.unique(t05_test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    t05_test_x[i] = le.transform(t05_test_x[i]) 
print('Done.')

Done.


In [None]:
# train 결측값 확인하기

train_x.isnull().sum()

LINE            0
PRODUCT_CODE    0
X_246           0
X_247           0
X_248           0
               ..
X_2861          0
X_2862          0
X_2863          0
X_2864          0
X_2865          0
Length: 888, dtype: int64

In [None]:
train_x = train_x.dropna(axis=1,thresh=len(train_x)/2)

# Columns in test data that are not present in train data
# train data에 맞춰서 test data의 column도 정리해주기.
cols_to_drop = [col for col in t05_test_x.columns if col not in train_x.columns]

# Drop the columns
t05_test_x.drop(columns=cols_to_drop, inplace=True)

In [None]:
# 결측치 threshold 안 넘는 열 채우기
# 일단 중앙값으로 해봅시다. 별로면 KNN

train_median = train_x.median()

train_x = train_x.fillna(train_median)
t05_test_x = t05_test_x.fillna(train_median)

#### Debugging

In [None]:
########################### DEBUG ########################################

if (train_x.isnull().any().any() or t05_test_x.isnull().any().any()):
  print('missing value check again')

if (len(train_x.columns) != len(t05_test_x.columns)):
  print('column # check again')

#### Feature selection
일단 지금은 UFS방식, 성능 올리려면 좀 더 복잡한 거 써야된다 !!

In [None]:
from sklearn.feature_selection import SelectKBest, f_regression

# the number of top features to select. 
SELECT_FEATURE = 100

# Apply UFS to the training data
selector = SelectKBest(f_regression, k=SELECT_FEATURE)

# Apply the same UFS to the test data
t05_train_new = selector.fit_transform(train_x, t05_train_y)
t05_test_new = selector.transform(t05_test_x)

  correlation_coefficient /= X_norms
  correlation_coefficient /= X_norms
  f_statistic = corr_coef_squared / (1 - corr_coef_squared) * deg_of_freedom


## Classification Model Fit

먼저 fit하고 XAI로 접근해서 X 변수 해석하기

In [None]:
RF04 = RandomForestClassifier(random_state=37).fit(t04_train_new, t04_train_y)
RF07 = RandomForestClassifier(random_state=37).fit(t07_train_new, t07_train_y)
RF14 = RandomForestClassifier(random_state=37).fit(t14_train_new, t14_train_y)
RF16 = RandomForestClassifier(random_state=37).fit(t16_train_new, t16_train_y)
RF06 = RandomForestClassifier(random_state=37).fit(t06_train_new, t06_train_y)
RF05 = RandomForestClassifier(random_state=37).fit(t05_train_new, t05_train_y)
print('Done.')

Done.


## Inference

Y_Quality 값으로 오차 보정하기.

In [None]:
t04_preds = RF04.predict(t04_test_new)
t07_preds = RF07.predict(t07_test_new)
t14_preds = RF14.predict(t14_test_new)
t16_preds = RF16.predict(t16_test_new)
t06_preds = RF06.predict(t06_test_new)
t05_preds = RF05.predict(t05_test_new)
print('Done.')

Done.


## Submit

In [None]:
submit = pd.read_csv('./sample_submission.csv')

In [None]:
submit['Y_Class'].iloc[t05_test_x.index[0]]

0

In [None]:
# 각 데이터에 맞는 위치로 예측값 할당.
for i in range(len(t04_test_x.index)):
  submit['Y_Class'].iloc[t04_test_x.index[i]] = t04_preds[i]

for i in range(len(t07_test_x.index)):
  submit['Y_Class'].iloc[t07_test_x.index[i]] = t07_preds[i]

for i in range(len(t14_test_x.index)):
  submit['Y_Class'].iloc[t14_test_x.index[i]] = t14_preds[i]

for i in range(len(t16_test_x.index)):
  submit['Y_Class'].iloc[t16_test_x.index[i]] = t16_preds[i]

for i in range(len(t06_test_x.index)):
  submit['Y_Class'].iloc[t06_test_x.index[i]] = t06_preds[i]

for i in range(len(t05_test_x.index)):
  submit['Y_Class'].iloc[t05_test_x.index[i]] = t05_preds[i]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [None]:
submit.to_csv('./baseline_submission.csv', index=False)