In [None]:
!pip install efficient-apriori

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting efficient-apriori
  Downloading efficient_apriori-2.0.3-py3-none-any.whl (14 kB)
Installing collected packages: efficient-apriori
Successfully installed efficient-apriori-2.0.3


In [None]:
# Imports
## Base/Default Libraries:
import pandas as pd
import numpy as np
import efficient_apriori as ea
from scipy import stats

## Utilities:
from numbers import Number
from google.colab import files
import io
import math


## One-Hot Encoder:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

## Model:
import xgboost as xgb

## Pickle & Saving objects onto memory:
import pickle

## Testing:
# from sklearn.model_selection import train_test_split
from pathlib import Path 

In [None]:
def who_is_applying(data: pd.DataFrame, rule: ea.Rule):
  """Unimplemented placeholder; performs no work and returns None.

  NOTE(review): judging by the name and signature this is presumably meant to
  report which rows of `data` an association `rule` applies to - confirm
  before implementing.
  """
  # df.drop_duplicates(inplace=True)
  return None

In [None]:
def removeUniqueColumns(data: pd.DataFrame, percentage: float) -> tuple[pd.DataFrame, list]:
  """Drop columns whose values are (almost) all distinct.

  A column is removed when its number of distinct non-null values is at least
  `percentage * len(data)`.

  Args:
    data: Frame to filter. It is not modified.
    percentage: Fraction of the row count (0..1) used as the uniqueness threshold.

  Returns:
    A `(filtered_frame, columns_removed)` tuple, where `columns_removed` lists
    the dropped column labels in their original order.
  """
  threshold = percentage * len(data.index)
  # Columns this unique carry close to one value per row, i.e. they behave like identifiers.
  columns_removed = [c for c in data.columns if data[c].nunique() >= threshold]
  return data.drop(columns_removed, axis=1), columns_removed

In [None]:
def getNullifiedColumns(data: pd.DataFrame, percentage: float):
  """Return the labels of the columns whose share of missing values is >= `percentage`.

  Args:
    data: Frame to inspect.
    percentage: Minimum fraction (0..1) of null entries for a column to be reported.

  Returns:
    A pandas Index holding the matching column labels.
  """
  null_share = data.isna().mean()  # per-column fraction of missing entries
  mostly_missing = null_share >= percentage
  return null_share.loc[mostly_missing].index

In [None]:
def get_rid_nullified(data: pd.DataFrame, percentage: float) -> pd.DataFrame:
  """Return a copy of `data` without the columns reported by `getNullifiedColumns`.

  Args:
    data: Frame to filter. It is not modified.
    percentage: Null-fraction threshold forwarded to `getNullifiedColumns`.
  """
  return data.drop(columns=getNullifiedColumns(data, percentage))

In [None]:
def get_rid_incrementals(data: pd.DataFrame, percentage: Number = 1) -> pd.DataFrame:
  """Drop identifier-like columns, i.e. columns with (nearly) one distinct value per row.

  A column is dropped when its number of distinct values (NaN counted as a
  value) is at least `percentage * len(data)`. With the default of 1 only
  columns in which every row is distinct are dropped.

  Args:
    data: Frame to filter. It is not modified.
    percentage: Fraction of the row count used as threshold; values above 1
      are clamped to 1.

  Returns:
    A new frame without the dropped columns.
  """
  percentage = min(percentage, 1)
  threshold = len(data.index) * percentage
  # `>=` rather than `==`: an exact match against a fractional threshold would
  # almost never hit. For percentage == 1 both comparisons agree, because a
  # column can never have more distinct values than rows.
  drop_these = [c for c in data.columns if data[c].nunique(dropna=False) >= threshold]
  return data.drop(columns=drop_these)

In [None]:
class ColumnDivider():
  """Partitions the columns of a frame by how they should be pre-processed.

  Groups (all are lists of column labels, in the frame's column order):
    * numeric_columns: int64/float64 columns, with the target feature removed.
    * very_numerical: numeric columns with more than `maximum_uniques`
      distinct values (the target is included here if it qualifies).
    * ordinals: the remaining numeric columns (at most `maximum_uniques`
      distinct values; the target is included here if it qualifies).
    * categorical_columns: every column that is not int64/float64.
  """
  def __init__(self, data: pd.DataFrame, target_feature: str, maximum_uniques: int):
    """Classify the columns of `data`.

    Args:
      data: Frame whose columns are classified. It is only read.
      target_feature: Label of the target column; excluded from `get_num()`.
      maximum_uniques: Largest number of distinct values a numeric column may
        have and still be treated as ordinal.
    """
    is_numeric = (data.dtypes == "float64") | (data.dtypes == "int64")
    self.numeric_columns = data.dtypes[is_numeric].index.tolist()
    self.very_numerical = [nc for nc in self.numeric_columns if data[nc].nunique() > maximum_uniques]
    self.categorical_columns = [c for c in data.columns if c not in self.numeric_columns]
    # Built with a comprehension (not a set difference) so the order is deterministic.
    very_numerical = set(self.very_numerical)
    self.ordinals = [nc for nc in self.numeric_columns if nc not in very_numerical]
    # The target is only removed from the plain numeric list; a non-numeric
    # target is simply absent from it, which must not abort construction.
    if target_feature in self.numeric_columns:
      self.numeric_columns.remove(target_feature)
  def get_vn(self):
    """Return the numeric columns with more than `maximum_uniques` distinct values."""
    return self.very_numerical
  def get_cat(self):
    """Return the categorical (non int64/float64) columns."""
    return self.categorical_columns
  def get_ord(self):
    """Return the numeric columns with at most `maximum_uniques` distinct values."""
    return self.ordinals
  def get_num(self):
    """Return all numeric columns (ordinal + very numerical) except the target."""
    return self.numeric_columns

In [None]:
def fill_null_model(data: pd.DataFrame, data_info: ColumnDivider):
  """Fill missing values in `data` according to each column's group in `data_info`.

  * categorical columns: converted to categorical dtype, nulls become the
    extra category "nan";
  * ordinal columns: nulls become the column mode (0 if the column has no mode,
    i.e. it holds no values at all);
  * very numerical columns: nulls become the column mean.

  Columns listed in `data_info` but absent from `data` (e.g. the target in a
  test file) are skipped.

  Args:
    data: Frame to fill. It is modified IN PLACE.
    data_info: Column grouping to apply.

  Returns:
    The same `data` object, for call chaining.
  """
  present = set(data.columns)

  for c in data_info.get_cat():
    if c not in present:
      continue
    data[c] = pd.Categorical(data[c])
    # add_categories raises ValueError when the category already exists, which
    # happens when a frame is passed through this function a second time.
    if "nan" not in data[c].cat.categories:
      data[c] = data[c].cat.add_categories("nan")
    data[c] = data[c].fillna("nan")

  for c in data_info.get_ord():
    if c not in present:
      continue
    modes = data[c].mode()
    # An all-null column has no mode; 0 is an arbitrary fallback (such columns
    # are better removed up front with get_rid_nullified).
    data[c] = data[c].fillna(modes.iloc[0] if len(modes) else 0)

  for c in data_info.get_vn():
    if c not in present:
      continue
    data[c] = data[c].fillna(data[c].mean(skipna=True))

  return data

In [None]:
class MostCorrelated():
  """Discretises a training frame and records, per column, its most associated columns.

  Instance attributes:
    tf: target feature label.
    di / data_info: the ColumnDivider describing the column groups.
    lists_len: requested number of associated columns per column.
    binning: {very-numerical column -> bin edges (outer edges widened to +-inf)}.
    new_col_list: the categorical view of the data, i.e. '<col>_binned' for
      every ordinal / very numerical column, followed by the categorical columns.
    most_correlated: {column in new_col_list -> list of the columns with the
      smallest chi-square p-values against it}.
  """
  BIN_LABELS = ["very low", "low", "medium", "high", "very high"]

  def __init__(self, data: pd.DataFrame, target_feature: str, num_correlated: int, data_info: ColumnDivider):
    """Bin `data` and compute the association map.

    Args:
      data: Training frame. It is modified IN PLACE (categoricals are filled,
        '<col>_binned' columns are added); pass a copy.
      target_feature: Label of the target column.
      num_correlated: How many associated columns to keep per column. Falls
        back to 10 when it is not smaller than `len(data.columns) - 1`, and is
        capped at the number of candidate columns.
      data_info: Column grouping for `data`.
    """
    self.tf = target_feature
    self.di = data_info
    self.data_info = data_info
    # Per-instance containers: as class attributes they would be shared (and
    # overwritten) by every MostCorrelated object.
    self.most_correlated = {}
    self.binning = {}
    if num_correlated >= len(data.columns) - 1:
      num_correlated = 10
    self.lists_len = num_correlated

    cat_cols, vnum_cols, ord_cols = data_info.get_cat(), data_info.get_vn(), data_info.get_ord()

    # Categorical columns: missing values become their own "nan" category.
    for c in cat_cols:
      self._store_with_nan_category(data, c, data[c])

    # Very numerical columns: five quantile bins, remembered for processTest.
    for c in vnum_cols:
      try:
        binned, edges = pd.qcut(data[c], 5, labels=self.BIN_LABELS, retbins=True)
      except ValueError:
        # Highly skewed data yields duplicate quantile edges; use equal-width bins instead.
        binned, edges = pd.cut(data[c], 5, labels=self.BIN_LABELS, retbins=True)
      # Widen the outer edges so unseen test values outside the training range still get a bin.
      self.binning[c] = np.concatenate(([-np.inf], edges[1:-1], [np.inf]))
      self._store_with_nan_category(data, c + '_binned', binned)

    # Ordinal columns: every distinct value is its own category.
    for c in ord_cols:
      self._store_with_nan_category(data, c + '_binned', data[c])

    check_corr_list = [c + '_binned' for c in ord_cols]
    check_corr_list.extend(c + '_binned' for c in vnum_cols)
    check_corr_list.extend(cat_cols)
    self.new_col_list = check_corr_list

    # Number of neighbours actually available; argpartition needs kth < len.
    keep = min(self.lists_len, len(check_corr_list))
    for c_main in check_corr_list:
      list_p_vals = []
      for c_second in check_corr_list:
        if c_main != c_second:
          contingency_table = pd.crosstab(data[c_main], data[c_second])
          _, p, _, _ = stats.chi2_contingency(contingency_table)
          list_p_vals.append(p)
        else:
          list_p_vals.append(1.0)  # a column is never its own best neighbour
      if keep > 0:
        # The `keep` smallest p-values, i.e. the strongest associations.
        nearest = np.argpartition(np.array(list_p_vals), keep - 1)[:keep]
      else:
        nearest = []
      self.most_correlated[c_main] = [check_corr_list[i] for i in nearest]

  @staticmethod
  def _store_with_nan_category(data: pd.DataFrame, column: str, values) -> None:
    """Write `values` to `data[column]` as a categorical whose nulls are the category "nan"."""
    data[column] = pd.Categorical(values)
    if "nan" not in data[column].cat.categories:
      data[column] = data[column].cat.add_categories("nan")
    data[column] = data[column].fillna("nan")

  def getMap(self) -> dict:
    """Return the full {column -> associated columns} map."""
    return self.most_correlated
  def getMostCorrelated(self, feature: str) -> list:
    """Return the columns associated with `feature`, or None if it is unknown."""
    return self.most_correlated.get(feature)
  def getBinnings(self) -> dict:
    """Return the {very-numerical column -> bin edges} map learnt at construction."""
    return self.binning
  def getBinningColumns(self) -> list:
    """Return the list of categorical-view columns (binned columns + categoricals)."""
    return self.new_col_list
  def getTarget(self) -> str:
    """Return the target feature label."""
    return self.tf
  def processTest(self, data: pd.DataFrame, len_test: int, min_supp: float = 0.1, min_conf: float = 0.75):
    """Discretise `data` with the training bins and mine association rules from it.

    For every column of the categorical view, apriori is run on the rows of
    `data` restricted to that column's associated columns; each item is a
    `(column, value)` pair.

    Args:
      data: Frame to process; it must contain every column known to
        `data_info`. It is modified IN PLACE; pass a copy.
      len_test: Currently unused; kept for interface compatibility.
      min_supp: Minimum support handed to apriori.
      min_conf: Minimum confidence handed to apriori.

    Returns:
      `(rule_list, data)`: all mined rules and the transformed frame.
    """
    cat_cols, vnum_cols, ord_cols = self.data_info.get_cat(), self.data_info.get_vn(), self.data_info.get_ord()

    for c in cat_cols:
      self._store_with_nan_category(data, c, data[c])

    for c in vnum_cols:
      binned = pd.cut(data[c], self.binning[c], labels=self.BIN_LABELS)
      self._store_with_nan_category(data, c + '_binned', binned)

    for c in ord_cols:
      self._store_with_nan_category(data, c + '_binned', data[c])

    rule_list = list()
    for c in self.new_col_list:
      records = data[self.most_correlated[c]].to_dict(orient='records')
      transactions = [list(r.items()) for r in records]
      itemsets, rules = ea.apriori(transactions, min_support=min_supp, min_confidence=min_conf, output_transaction_ids=False)
      rule_list.extend(rules)

    return rule_list, data




In [None]:
class Model():
  """An XGBoost regressor plus everything needed to retrain it on its own mistakes.

  Workflow (all interactive, Colab only): `createModel` trains on an uploaded
  CSV, `uploadTest` predicts an uploaded test CSV and stores the predictions
  under /content, `uploadPrevResults` receives the true values for such a test
  and continues training on (a rule-selected subset of) the mispredicted rows.
  """
  model_name: str = ""
  train_filename: str = ""
  this_model: xgb.XGBRegressor = None # The xgb regressor model.
  apriori_helper: MostCorrelated = None # Binning + association map used to mine rules.
  data_info: ColumnDivider = None # Column grouping of the training data.
  encoder: OneHotEncoder = None # One-hot encoder for the categorical columns.
  tf: str = "" # Target feature label, set by createModel.
  err_margin = 0.05 # Relative error above which a prediction counts as a mistake. TODO: let the user choose it.

  def __init__(self, enc: OneHotEncoder = None, data_info: ColumnDivider = None):
    """Create an untrained model.

    Args:
      enc: Optional already-fitted encoder; when omitted one is fitted on the
        training data in `createModel`.
      data_info: Optional pre-computed column grouping; when omitted it is
        derived from the training data in `createModel`.
    """
    # Test filenames this model has predictions for. Per instance: a shared
    # class-level set would let one model accept test files that only another
    # model has prediction files for.
    self.name_dict = set()
    if enc is not None:
      self.encoder = enc
    if data_info is not None:
      self.data_info = data_info

  def getModel(self) -> xgb.XGBRegressor:
    """Return the underlying regressor (None before `createModel` succeeded)."""
    return self.this_model

  @staticmethod
  def _read_single_upload(extension: str):
    """Run the Colab upload dialog; return `(filename, raw_bytes)` or None if the upload is not exactly one `extension` file."""
    data_uploaded = files.upload()
    filenames_uploaded = list(data_uploaded.keys())
    if len(filenames_uploaded) != 1 or not filenames_uploaded[0].endswith(extension):
      print("\nError. File is not a '" + extension + "' file.\n")
      return None
    filename = filenames_uploaded[0]
    return filename, data_uploaded[filename]

  @staticmethod
  def _ask_int(prompt: str) -> int:
    """Prompt until the user enters a non-negative integer and return it."""
    answer = None
    while answer is None:
      raw = input(prompt)
      answer = int(raw) if raw.isdecimal() else None
    return answer

  @staticmethod
  def _new_encoder() -> OneHotEncoder:
    """Create a dense-output OneHotEncoder on both old and new scikit-learn versions."""
    try:
      return OneHotEncoder(sparse_output=False)  # scikit-learn >= 1.2
    except TypeError:
      return OneHotEncoder(sparse=False)  # older scikit-learn

  def _feature_names(self, columns):
    """Return the encoder's output column names (API was renamed in scikit-learn 1.0)."""
    getter = getattr(self.encoder, "get_feature_names_out", None)
    if getter is None:
      getter = self.encoder.get_feature_names
    return getter(columns)

  def _encode(self, filled: pd.DataFrame, fit: bool = False) -> pd.DataFrame:
    """Replace the categorical columns of `filled` (modified in place) by their one-hot encoding."""
    categorical_columns = self.data_info.get_cat()
    transform = self.encoder.fit_transform if fit else self.encoder.transform
    encoded = pd.DataFrame(transform(filled[categorical_columns]))
    encoded.columns = self._feature_names(categorical_columns)
    filled.drop(categorical_columns, axis=1, inplace=True)
    # Both halves are re-indexed so they line up row by row.
    return pd.concat([filled.reset_index(drop=True), encoded.reset_index(drop=True)], axis=1)

  def _prediction_path(self, test_filename: str) -> str:
    """Return where the predictions for `test_filename` are stored ($modelname_$testname.txt)."""
    return '/content/' + self.model_name + '_' + test_filename + '.txt'

  @staticmethod
  def _relative_error(true_val: float, pred: float) -> float:
    """Return |true - pred| / |true|; a zero true value gives 0 for an exact hit and inf otherwise."""
    if true_val == 0:
      return 0.0 if pred == 0 else math.inf
    return abs(true_val - pred) / abs(true_val)

  def createModel(self, name: str) -> bool:
    """Interactively upload a training CSV and fit the regressor on it.

    Asks for the target feature, the ordinal/very-numerical cut-off and the
    number of associated features to use when mining rules.

    Args:
      name: Name of the model; used as prefix of its prediction files.

    Returns:
      True on success, False if the upload was not a single '.csv' file.
    """
    self.model_name = name
    self.tf = ""
    print("Please upload your TRAIN file (.csv):\n")
    upload = self._read_single_upload('.csv')
    if upload is None:
      return False
    self.train_filename, raw_train = upload

    original_data = pd.read_csv(io.StringIO(raw_train.decode('utf-8')))

    # Ask for a target feature until an existing column is named.
    target_feature = None
    while target_feature is None:
      target_feature = input("\nPlease enter the target feature: ")
      if target_feature not in original_data.columns:
        target_feature = None
    self.tf = target_feature

    # Boundary between ordinal and very numerical columns.
    max_uniq = self._ask_int("\nEnter maximum number of uniques allowed for ordinal (Will fill NAN values with mode instead of mean): ")
    # How many associated features are examined per column when mining rules.
    best_correlated_features = self._ask_int("\nEnter number of features we will test against, when facing errors: ")

    data_for_binning = original_data.copy(deep=True)  # MostCorrelated mutates its input.
    if self.data_info is None:
      self.data_info = ColumnDivider(data_for_binning, self.tf, max_uniq)
    self.apriori_helper = MostCorrelated(data_for_binning, self.tf, best_correlated_features, self.data_info)

    data_for_ohe_filled = fill_null_model(original_data.copy(deep=True), self.data_info)
    if self.encoder is None:
      self.encoder = self._new_encoder()
      OHE_data = self._encode(data_for_ohe_filled, fit=True)
    else:
      OHE_data = self._encode(data_for_ohe_filled)

    self.this_model = xgb.XGBRegressor(n_estimators=100, random_state=0)
    self.this_model.fit(OHE_data.drop(self.tf, axis=1), OHE_data[self.tf])
    return True

  def uploadTest(self) -> bool:
    """Interactively upload a test CSV, predict it and store the predictions.

    Predictions are written one per line (no trailing newline) to
    '/content/<model>_<testfile>.txt'. The test filename is registered for
    `uploadPrevResults` only once that file has been written.

    Returns:
      True on success, False on an invalid upload or a failed write.
    """
    print("Please upload your TEST file (.csv):\n")
    upload = self._read_single_upload('.csv')
    if upload is None:
      return False
    test_filename, raw_test = upload

    test_data = pd.read_csv(io.StringIO(raw_test.decode('utf-8')))
    test_data_filled = fill_null_model(test_data, self.data_info)
    OHE_test_data = self._encode(test_data_filled)

    predictions_test = self.this_model.predict(OHE_test_data)

    try:
      with open(self._prediction_path(test_filename), 'w') as f:
        f.write('\n'.join(str(row_pred) for row_pred in predictions_test))
    except OSError:
      print("\nError. Couldn't write the prediction file (\'.txt\').\n")
      return False

    # It is fine for several models to use the same test file.
    self.name_dict.add(test_filename)
    return True

  def uploadPrevResults(self) -> bool:
    """Interactively upload the true values of a previous test and retrain on the mistakes.

    Rows whose relative error exceeds `err_margin` are the mistakes. Association
    rules mined from them select a subset of rows (all mistakes when no rule
    selects anything) on which the regressor continues training.

    Returns:
      True when the model was updated or there was nothing to learn, False on
      any input problem (wrong file type, unknown test, unreadable prediction
      file, length mismatch, non-numeric values).
    """
    if not self.name_dict:
      # Without a registered test the filename prompt below could never be satisfied.
      print('Error. No test file has been uploaded for this model yet.\n')
      return False

    print("\nTEST RESULT file should be a text file, with each row having \nthe real target feature value of the compatible row in the data.\n")
    print("Make sure the file does NOT end with a newline.\n")
    print("Please upload your TEST RESULT file (.txt):\n")
    upload = self._read_single_upload('.txt')
    if upload is None:
      return False
    _, raw_results = upload

    test_filename = 'nan'
    while test_filename == 'nan' or test_filename not in self.name_dict:
      test_filename = input("Please enter a valid test filename:\n")

    list_true_res = raw_results.decode("utf-8").splitlines()
    # Be lenient about trailing blank lines instead of failing the length check.
    while list_true_res and not list_true_res[-1].strip():
      list_true_res.pop()

    try:
      with open(self._prediction_path(test_filename), "r") as f:
        test_predictions = [line.rstrip() for line in f]
    except OSError:
      print('Error. Failed to open saved prediction files.')
      return False

    if len(list_true_res) != len(test_predictions):
      print('Error. Not equal number of predictions and true values.\n')
      return False

    try:
      true_vs_pred = [(float(true_val), float(pred)) for true_val, pred in zip(list_true_res, test_predictions)]
    except ValueError:
      print('Value Error. Couldn\'t transfer one of the true values or prediction values to numeric value.\n')
      return False

    err_indices = [idx for idx, (true_val, pred) in enumerate(true_vs_pred)
                   if self._relative_error(true_val, pred) > self.err_margin]
    if not err_indices:
      print('No prediction exceeded the error margin; the model was left unchanged.\n')
      return True

    # Only the mispredicted rows, completed with their true target values.
    test_df = pd.read_csv('/content/' + test_filename).iloc[err_indices].copy()
    test_df[self.tf] = [true_vs_pred[i][0] for i in err_indices]

    test_rows_len = len(true_vs_pred)
    rule_list, test_transformed = self.apriori_helper.processTest(test_df.copy(deep=True), test_rows_len)

    # NOTE(review): this sorts ascending, so the shortest / least supported
    # rules are tried first, while the intent seems to be the longest rules with
    # high support first (reverse=True) - confirm before changing.
    rule_list = sorted(rule_list, key=lambda rule: (len(rule.lhs) + len(rule.rhs)) * rule.support)

    # Retrain on at most 20% of the test size, using at most 10% of the rules.
    row_atleast = min(math.ceil(0.2 * test_rows_len), len(err_indices))
    rules_atleast = math.ceil(0.1 * len(rule_list))

    indices_rows = set()
    if row_atleast != len(err_indices):
      for rule in rule_list:
        rules_atleast = rules_atleast - 1
        conditions = list(rule.rhs) + list(rule.lhs)  # (column, value) pairs
        for index, row in test_transformed.iterrows():
          if index in indices_rows:
            continue
          if all(row[column] == value for column, value in conditions):
            indices_rows.add(index)
        if rules_atleast == 0 or len(indices_rows) >= row_atleast:
          break

    if indices_rows:
      # Keep only the mistakes that comply with the selected rules.
      test_df = test_df.loc[test_df.index.isin(indices_rows)].copy()
    # Otherwise every mispredicted row is used.

    test_data_fit_filled = fill_null_model(test_df, self.data_info)
    OHE_test_data_fit = self._encode(test_data_fit_filled)

    # Continue boosting from the current model instead of starting from scratch.
    self.this_model = self.this_model.fit(OHE_test_data_fit.drop(self.tf, axis=1), OHE_test_data_fit[self.tf], xgb_model=self.this_model.get_booster())
    return True
  
    
    


In [None]:
def our_project(enc: OneHotEncoder = None, data_info: ColumnDivider = None):
  """Interactive menu for creating models, predicting tests and feeding back results.

  The menu repeats until option 4 is chosen for an existing model.

  Args:
    enc: Optional fitted encoder handed to every new Model.
    data_info: Optional column grouping handed to every new Model.

  Returns:
    The regressor of the model requested through option 4.
  """
  models = {}  # model name -> Model

  def ask_existing_model():
    """Prompt until the name of an existing model is entered and return that model."""
    model_name = "nan"
    while model_name == "nan" or model_name not in models:
      model_name = input("\nEnter the name of the model (except \'nan\'): \n")
    return models[model_name]

  while True:
    option = 0
    while option < 1 or option > 4:
      raw_option = input("Please choose:\n1. Create New Model.\n2. Upload another test file for prediction.\n3. Upload results for a previous test file.\n4. Return a requested model.\nEnter option: ")
      option = int(raw_option) if raw_option.isdecimal() else 0

    if option == 1:
      model_name = "nan"
      while model_name == "nan":
        model_name = input("\nEnter a name for the new model (except \'nan\'): \n")
      models[model_name] = Model(enc, data_info)
      if not models[model_name].createModel(model_name):
        del models[model_name]  # creation failed; do not keep a half-built model
      continue

    if not models:
      # Without this guard the name prompt below could never be satisfied.
      print("\nError. No model exists yet; create one first (option 1).\n")
      continue

    if option == 2:
      ask_existing_model().uploadTest()  # Failure is reported by the model itself.
    elif option == 3:
      ask_existing_model().uploadPrevResults()
    else:
      return ask_existing_model().getModel()
    

In [None]:
# def not_our_project(df_train, df_tests):
def _make_dense_encoder():
  """Create a dense-output OneHotEncoder on both old and new scikit-learn versions."""
  try:
    return OneHotEncoder(sparse_output=False)  # scikit-learn >= 1.2
  except TypeError:
    return OneHotEncoder(sparse=False)  # older scikit-learn


def _encoded_feature_names(encoder, columns):
  """Return the encoder's output column names (API was renamed in scikit-learn 1.0)."""
  getter = getattr(encoder, "get_feature_names_out", None)
  if getter is None:
    getter = encoder.get_feature_names
  return getter(columns)


def _success_rate(predictions, true_values, margin=0.05):
  """Return the fraction of predictions whose relative error is below `margin`."""
  hits = sum(1 for pred, true_val in zip(predictions, true_values) if abs(pred - true_val) / true_val < margin)
  return hits / len(true_values)


def _refit_on_mistakes(model, features, answers, margin=0.05):
  """Continue training `model` on the rows of `features` it mispredicts by more than `margin`."""
  predictions = model.predict(features)
  true_values = answers.values.tolist()
  mistakes = [i for i, (pred, true_val) in enumerate(zip(predictions, true_values))
              if abs(pred - true_val) / true_val > margin]
  if not mistakes:
    return model  # nothing to learn from; fitting on an empty frame would fail
  return model.fit(features.iloc[mistakes], answers.iloc[mistakes], xgb_model=model.get_booster())


def _write_answers(path, answers):
  """Write one value per line to `path`, without a trailing newline."""
  try:
    with open(path, 'w') as f:
      f.write('\n'.join(str(ans) for ans in answers))
  except OSError:
    print("\nError. Couldn't write the prediction file (\'.txt\').\n")


def main(): # Our main shows us the difference between our solution and a normal training (training on all mistakes)
  """Compare the baseline (retrain on all mistakes) with the interactive rule-based approach.

  Reads '/content/train.csv', trains and evaluates the baseline, writes the
  files the interactive flow expects (train1.csv, test1-3.csv,
  results_test1-3.txt under /content), runs `our_project` and prints both
  success rates (share of held-out rows predicted within 5%).
  """
  seeds = (190, 180, 170)
  data = pd.read_csv('/content/train.csv')

  data_info = ColumnDivider(data.copy(deep=True), 'SalePrice', 10)
  categorical_columns = data_info.get_cat()

  # Files for the interactive flow.
  # NOTE(review): this samples 5% for training while the baseline below uses
  # 50%, so the two approaches see different splits and `test_final` may
  # overlap with these rows - confirm that is intended.
  saved_train = data.sample(frac=0.05, random_state=200)
  saved_pool = data.drop(saved_train.index)
  saved_tests = [saved_pool.sample(frac=0.2, random_state=seed) for seed in seeds]
  saved_answers = [saved_test['SalePrice'] for saved_test in saved_tests]
  saved_tests = [saved_test.drop('SalePrice', axis=1) for saved_test in saved_tests]

  # One-hot encoded view of the whole data set for the baseline.
  data_filled = fill_null_model(data.copy(deep=True), data_info)
  encoder = _make_dense_encoder()
  data_filled_encoded = pd.DataFrame(encoder.fit_transform(data_filled[categorical_columns]))
  data_filled_encoded.columns = _encoded_feature_names(encoder, categorical_columns)
  data_filled.drop(categorical_columns, axis=1, inplace=True)
  OHE_data = pd.concat([data_filled.reset_index(drop=True), data_filled_encoded.reset_index(drop=True)], axis=1)

  train = OHE_data.sample(frac=0.5, random_state=200)
  test = OHE_data.drop(train.index)
  tests = [test.sample(frac=0.2, random_state=seed) for seed in seeds]

  # Held-out rows: everything in `test` that is in none of the three feedback tests.
  test_final = test.copy(deep=True)
  for feedback_test in tests:
    test_final.drop(feedback_test.index, inplace=True, errors='ignore')

  # Baseline: train once, then keep training on ALL mistakes of each feedback test.
  normal_model = xgb.XGBRegressor(n_estimators=100, random_state=0)
  normal_model.fit(train.drop('SalePrice', axis=1), train['SalePrice'])
  for feedback_test in tests:
    normal_model = _refit_on_mistakes(normal_model, feedback_test.drop('SalePrice', axis=1), feedback_test['SalePrice'])

  final_features = test_final.drop('SalePrice', axis=1)
  true_values_test = test_final['SalePrice'].values.tolist()
  normal_success = _success_rate(normal_model.predict(final_features), true_values_test)

  saved_train.to_csv(Path('/content/train1.csv'), index=False)
  print(saved_train.columns)

  for number, (saved_test, answers) in enumerate(zip(saved_tests, saved_answers), start=1):
    saved_test.to_csv(Path('/content/test' + str(number) + '.csv'), index=False)
    _write_answers('/content/results_test' + str(number) + '.txt', answers)

  our_model = our_project(encoder, data_info)
  our_predictions = our_model.predict(final_features)
  print('our_pred\n', our_predictions)
  our_success = _success_rate(our_predictions, true_values_test)

  print('Our success: ', our_success, '\nNormal Success: ', normal_success)
# main()  # Uncomment to run the baseline-vs-ours comparison (it calls our_project itself).
# Start the interactive menu; blocks on user input until option 4 returns a model.
our_project()


  
      

  

  


Please upload your TRAIN file (.csv):



Saving train1.csv to train1 (1).csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  agged[v] = maybe_downcast_to_dtype(agged[v], data[v].dtype)


True
True
