In [1]:
# Star-import of gezi's shared namespace. Names used below without an explicit
# import in this notebook (pd, np, glob, os, sys, gezi) presumably come from
# here -- TODO confirm against gezi.common.
from gezi.common import *
# Put the parent directory on the import path so that `src.config` resolves.
sys.path.append('..')
# presumably configures pandas display options -- TODO confirm.
gezi.set_pandas()
# gezi.set_pandas_widder()
# Star-import of the project config. FLAGS and BR used by later cells are not
# defined in this notebook, so they presumably come from here or from
# gezi.common -- TODO confirm.
from src.config import *
# presumably initialises FLAGS (get_df reads FLAGS.root) -- TODO confirm.
gezi.init_flags()

In [2]:
def create_df(folder, workers=80):
  """Load every ``*.json`` notebook under ``folder`` into one cell-level frame.

  Args:
    folder: Directory that is globbed for ``*.json`` files.
    workers: Passed through to ``gezi.prun`` as its third argument
      (presumably the degree of parallelism -- TODO confirm).

  Returns:
    The concatenation of the per-file frames. Each row is one cell with a
    ``cell_id`` column (the JSON index) and an ``id`` column (the file name
    without directory and extension). Newlines in ``source`` are replaced
    by ``BR``. The original per-file index is kept by the concat.
  """
  def _create_df(json_path):
    # One frame per notebook file; the JSON index becomes the `cell_id` column.
    cells = pd.read_json(json_path, dtype={'cell_type': 'category', 'source': 'str'})
    cells = cells.reset_index().rename({"index":"cell_id"}, axis=1)
    # Notebook id = file name stripped of its directory and extension.
    stem = json_path.rsplit(".", 1)[0]
    cells["id"] = stem.rsplit("/", 1)[-1]
    return cells

  json_paths = glob.glob(f'{folder}/*.json')
  frames = gezi.prun(_create_df, json_paths, workers)
  df = pd.concat(frames)
  # Replace newlines with the BR marker so each source stays on one line.
  df['source'] = df['source'].apply(lambda text: text.replace('\n', BR))
  return df

In [3]:
def _interpolate_rel_ranks(code_flags, n_code):
  """Return the fractional position of every cell of one notebook.

  The k-th code cell (1-based) gets ``k * (1 / (n_code + 1))``. A run of
  ``m`` consecutive non-code cells that follows ``c`` code cells is spread
  evenly inside the next slot: the p-th cell of the run (1-based) gets
  ``c * slot + p * (slot / (m + 1))``.
  """
  slot = 1 / (n_code + 1)
  rel = [-1] * len(code_flags)
  pending = []    # indices of the current run of non-code cells
  codes_seen = 0

  def settle(base):
    # Place the pending run between `base` and the next code slot.
    width = slot / (len(pending) + 1)
    for pos, idx in enumerate(pending, start=1):
      rel[idx] = base + pos * width
    del pending[:]

  for idx, flag in enumerate(code_flags):
    if flag:
      settle(codes_seen * slot)
      codes_seen += 1
      rel[idx] = codes_seen * slot
    else:
      pending.append(idx)
  # Non-code cells after the last code cell.
  settle(codes_seen * slot)
  return rel


def to_rank(df_orders, cell_dict):
  """Expand per-notebook cell orders into one row per cell with rank features.

  Args:
    df_orders: Frame with an ``id`` column and a ``cell_order`` column holding
      whitespace-separated cell ids in their correct order.
    cell_dict: Mapping from cell id to cell type. Any type other than
      ``'code'`` is counted as markdown.

  Returns:
    DataFrame with columns ``id``, ``cell_id``, ``n_cell``, ``n_code_cell``,
    ``n_markdown_cell``, ``markdown_frac``, ``rank`` (position in the
    notebook), ``code_rank`` / ``markdown_rank`` (position among cells of the
    same type, ``-1`` for the other type) and ``rel_rank`` (see
    ``_interpolate_rel_ranks``).

  Raises:
    ZeroDivisionError: if a ``cell_order`` contains no cell ids.
    KeyError: if a cell id is missing from ``cell_dict``.
  """
  cols = {
    'id': [],
    'cell_id': [],
    'n_cell': [],
    'n_code_cell': [],
    'n_markdown_cell': [],
    'markdown_frac': [],
    'rank': [],
    'code_rank': [],
    'markdown_rank': [],
    'rel_rank': [],
  }
  for order in df_orders.itertuples():
    cell_ids = order.cell_order.split()
    total = len(cell_ids)
    code_flags = [cell_dict[cid] == 'code' for cid in cell_ids]

    # Position of each cell among the cells of its own type.
    n_code = n_md = 0
    code_pos, md_pos = [], []
    for flag in code_flags:
      if flag:
        code_pos.append(n_code)
        md_pos.append(-1)
        n_code += 1
      else:
        code_pos.append(-1)
        md_pos.append(n_md)
        n_md += 1
    md_frac = n_md / (n_code + n_md)

    rel = _interpolate_rel_ranks(code_flags, n_code)

    cols['id'] += [order.id] * total
    cols['cell_id'] += cell_ids
    cols['n_cell'] += [total] * total
    cols['n_code_cell'] += [n_code] * total
    cols['n_markdown_cell'] += [n_md] * total
    cols['markdown_frac'] += [md_frac] * total
    cols['rank'] += range(total)
    cols['code_rank'] += code_pos
    cols['markdown_rank'] += md_pos
    cols['rel_rank'] += rel

  return pd.DataFrame(cols)

In [4]:
def set_fold(df):
  """Assign cross-validation folds to ``df`` via ``gezi.set_fold_worker``.

  Seeds NumPy's global RNG with 1024, then asks gezi for 5 folds grouped by
  ``ancestor_id`` using 80 workers. Nothing is returned; the helper
  presumably writes its result into ``df`` in place -- TODO confirm.
  """
  # NOTE(review): GroupKFold is imported here but not referenced in this function.
  from sklearn.model_selection import GroupKFold
  n_folds, rng_seed = 5, 1024
  np.random.seed(rng_seed)
  gezi.set_fold_worker(df, n_folds, workers=80, group_key='ancestor_id')

In [5]:
def get_df(mark='train'):
  """Build (or load from cache) the cell-level frame for one data split.

  Args:
    mark: Split name. It selects ``{FLAGS.root}/{mark}`` as the JSON folder
      and ``{FLAGS.root}/{mark}.fea`` as the feather cache. Only ``'train'``
      gets ancestor info, folds and a written cache.

  Returns:
    DataFrame with one row per cell. If the loaded or created frame has no
    ``rank`` column, the rank features from ``to_rank`` plus ``pct_rank``
    are merged in. For ``'train'`` the frame is sorted by ``id`` and
    ``cell_id`` and passed through ``set_fold``.
  """
  workers = 80 if mark == 'train' else 1
  train_file = f'{FLAGS.root}/{mark}.fea'
  if os.path.exists(train_file):
    df = pd.read_feather(train_file)
  else:
    df = create_df(f'{FLAGS.root}/{mark}', workers)
  if 'rank' not in df.columns:
    # NOTE(review): keyed by cell_id alone; assumes cell ids do not collide
    # across notebooks -- TODO confirm.
    cell_dict = dict(zip(df.cell_id.values, df.cell_type.values))
    if mark == 'train':
      df_ancestors = pd.read_csv(f'{FLAGS.root}/train_ancestors.csv')
      df = df.merge(df_ancestors, on=['id'])
      df_orders = pd.read_csv(f'{FLAGS.root}/train_orders.csv')
    else:
      # No ground-truth order for this split: use the order of the rows.
      df_orders = df.groupby('id')['cell_id'].apply(list).reset_index(name='cell_order')
      df_orders['cell_order'] = df_orders.cell_order.apply(lambda x: ' '.join(x))

    df_rank = to_rank(df_orders, cell_dict)
    # rank / (n_cell - 1). The denominator is clipped to 1 so a single-cell
    # notebook gets 0.0 instead of inf * 0 = NaN.
    df_rank['pct_rank'] = (1. / (df_rank['n_cell'] - 1).clip(lower=1)) * df_rank['rank']

    df = df.merge(df_rank, on=['id', 'cell_id'])
  if mark == 'train':
    df = df.sort_values(['id', 'cell_id'])
    set_fold(df)
    # drop=True: feather needs a default index, but writing the old index as
    # a column would add 'index', then 'level_0', on each reload-and-save
    # cycle and finally raise "cannot insert level_0, already exists".
    df.reset_index(drop=True).to_feather(train_file)
  return df

In [6]:
# Build the training frame (or load it from the feather cache) and display it.
df = get_df()
df

run:   0%|          | 0/1741 [00:00<?, ?it/s]

  0%|          | 0/80 [00:00<?, ?it/s]

Unnamed: 0,cell_id,cell_type,source,id,ancestor_id,parent_id,n_cell,n_code_cell,n_markdown_cell,markdown_frac,rank,code_rank,markdown_rank,rel_rank,pct_rank,fold,worker
2541993,032e2820,markdown,"Создаем список признаков, используемых в модели - отбор признаков",00001756c60be8,945aea18,,58,30,28,0.4828,46,-1,23,0.7581,0.8070,3,3
2541957,038b763d,code,import warningsʶwarnings.filterwarnings('ignore'),00001756c60be8,945aea18,,58,30,28,0.4828,4,2,-1,0.0968,0.0702,3,43
2541979,06365725,code,train_df = train_df[feature_names + [target_name]]ʶtest_df = test_df[feature_names + ['Id']]ʶX = train_df[feature_names]ʶy = train_df[target_name],00001756c60be8,945aea18,,58,30,28,0.4828,48,24,-1,0.8065,0.8421,3,58
2541959,0beab1cd,code,"def evaluate_preds(train_true_values, train_pred_values, test_true_values, test_pred_values):ʶ print(""Train R2:\t"" + str(round(r2(train_true_values, train_pred_values), 3)))ʶ print(""Test R2:\t"" + str(round(r2(test_true_values, test_pred_values), 3)))ʶ ʶ plt.figure(figsize=(18,10))ʶ ʶ plt.subplot(121)ʶ sns.scatterplot(x=train_pred_values, y=train_true_values)ʶ plt.xlabel('Predicted values')ʶ plt.ylabel('True values')ʶ plt.title('Train sample prediction')ʶ ʶ ...",00001756c60be8,945aea18,,58,30,28,0.4828,8,4,-1,0.1613,0.1404,3,23
2542002,0d136e08,markdown,**Загрузка данных**,00001756c60be8,945aea18,,58,30,28,0.4828,11,-1,5,0.2016,0.1930,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3105692,e70a860e,code,df['bhk'] = df['size'].apply(lambda x: int(x.split(' ')[0])),fffe1d764579d5,3c40bfa6,,72,61,11,0.1528,16,13,-1,0.2258,0.2254,3,68
3105721,ec3a94d7,code,df = df.dropna(),fffe1d764579d5,3c40bfa6,,72,61,11,0.1528,48,42,-1,0.6935,0.6761,3,53
3105703,ecf7b4a6,code,df.info(),fffe1d764579d5,3c40bfa6,,72,61,11,0.1528,27,24,-1,0.4032,0.3803,3,3
3105681,f71c538e,code,df = pd.read_csv('/kaggle/input/bengaluru-house-price-data/Bengaluru_House_Data.csv'),fffe1d764579d5,3c40bfa6,,72,61,11,0.1528,4,2,-1,0.0484,0.0563,3,38


In [13]:
# Spot-check the rank features of one specific cell id.
df[df.cell_id=='586de380']

Unnamed: 0,cell_id,cell_type,source,id,ancestor_id,parent_id,n_cell,n_code_cell,n_markdown_cell,markdown_frac,rank,code_rank,markdown_rank,rel_rank,pct_rank,fold,worker
4324597,586de380,markdown,# Part-1 Applying Different Estimators For Simple Classification Problem,001106f5f235f6,ac35f431,ca743ee8531539,70,60,10,0.1429,0,-1,0,0.0082,0.0,0,0


In [7]:
# Build the test frame; this split has no orders file, so ranks follow row order.
df_test = get_df('test')
df_test

run:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,cell_id,cell_type,source,id,n_cell,n_code_cell,n_markdown_cell,markdown_frac,rank,code_rank,markdown_rank,rel_rank,pct_rank
0,ddfd239c,code,"import numpy as np # linear algebraʶimport pandas as pd # data processing,ʶimport matplotlib.pyplot as pltʶfrom sklearn.decomposition import PCAʶfrom sklearn.preprocessing import StandardScalerʶfrom sklearn.preprocessing import scaleʶfrom sklearn.impute import SimpleImputerʶʶʶimport osʶfor dirname, _, filenames in os.walk('/kaggle/input'):ʶ for filename in filenames:ʶ print(os.path.join(dirname, filename))",0009d135ece78d,13,7,6,0.4615,0,0,-1,0.125,0.0
1,c6cd22db,code,df = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')ʶdf,0009d135ece78d,13,7,6,0.4615,1,1,-1,0.25,0.0833
2,1372ae9b,code,"numerical_data = df.loc[:, ~df.columns.isin(['id', ""diagnosis""])]ʶʶlabels = df[""diagnosis""].factorize(['B','M'])[0]ʶʶheader_labels = pd.DataFrame(data=labels, columns=[""diagnosis""])",0009d135ece78d,13,7,6,0.4615,2,2,-1,0.375,0.1667
3,90ed07ab,code,"def comparison_plot_maker(data_1, data_2, name, column_name_1, column_name_2):ʶ # Scaling Data for testingʶ # data_1 = scale(data_1)ʶ # data_2 = scale(data_2)ʶʶ range = np.random.randn(len(data_1))ʶ plt.scatter(range, data_1, label=column_name_1, color='orange')ʶ plt.scatter(range, data_2, label=column_name_2, color='green')ʶ plt.title(name)ʶ plt.xlabel('X-Axis')ʶ plt.ylabel('Y-Axis')ʶ plt.legend()ʶ plt.show()ʶ",0009d135ece78d,13,7,6,0.4615,3,3,-1,0.5,0.25
4,7f388a41,code,"# Ploting data with different columnsʶ#####################################ʶcomparison_plot_maker(numerical_data[""radius_mean""], numerical_data[""radius_worst""], ""Mean Radius vs Worst Radius"", ""Mean Radius"", ""Worst Radius"")ʶcomparison_plot_maker(numerical_data[""perimeter_se""], numerical_data[""perimeter_worst""], ""S.D Perimeter vs Worst Perimeter"", ""S.D Perimeter"", ""Worst Perimeter"")ʶcomparison_plot_maker(numerical_data[""compactness_mean""], numerical_data[""compactness_se""], ""Mean Compactness vs...",0009d135ece78d,13,7,6,0.4615,4,4,-1,0.625,0.3333
5,2843a25a,code,"# Scaling Dataʶscaler = StandardScaler()ʶscaler.fit(numerical_data)ʶ# print(scaled_data)ʶʶ# Assigning VariablesʶX = scaler.transform(numerical_data)ʶy = labelsʶʶmy_imputer = SimpleImputer()ʶpd.DataFrame(X).fillna(0)ʶX = my_imputer.fit_transform(X)ʶʶprint(""Ignore the errors, they occurred because of NaN values"")ʶprint()ʶprint(""But worry not human! The errors are fixed with Imputer >o>"")ʶprint()",0009d135ece78d,13,7,6,0.4615,5,5,-1,0.75,0.4167
6,06dbf8cf,code,"# 3. Implementing PCA on X (green for benign; red for malignant)ʶ################################################################ʶʶ# PCAʶPCA3=PCA(n_components=2)ʶ# print(X.shape)ʶPCA3.fit(X)ʶXPCA = PCA3.transform(X)ʶ# print(XPCA.shape)ʶʶ# Plottingʶplt.figure()ʶplt.title(""PCA"")ʶplt.xlabel('X-Axis')ʶplt.ylabel('Y-Axis')ʶʶplt.plot(XPCA[y==0,0],XPCA[y==0,1],'g.')ʶplt.plot(XPCA[y==1,0],XPCA[y==1,1],'r.')ʶʶplt.show()",0009d135ece78d,13,7,6,0.4615,6,6,-1,0.875,0.5
7,f9893819,markdown,# Scaling Data ⚖ʶLet's scale the data so PCA can be applied,0009d135ece78d,13,7,6,0.4615,7,-1,0,0.8929,0.5833
8,ba55e576,markdown,## Testing Plots >w>ʶLet's these mystery soliving plots! :O,0009d135ece78d,13,7,6,0.4615,8,-1,1,0.9107,0.6667
9,39e937ec,markdown,"## Plotting PCA 📊ʶThus, the sun boils down to this, the PCA is hence plotted 😮",0009d135ece78d,13,7,6,0.4615,9,-1,2,0.9286,0.75


In [8]:
# Re-display the test frame built in the previous cell.
df_test

Unnamed: 0,cell_id,cell_type,source,id,n_cell,n_code_cell,n_markdown_cell,markdown_frac,rank,code_rank,markdown_rank,rel_rank,pct_rank
0,ddfd239c,code,"import numpy as np # linear algebraʶimport pandas as pd # data processing,ʶimport matplotlib.pyplot as pltʶfrom sklearn.decomposition import PCAʶfrom sklearn.preprocessing import StandardScalerʶfrom sklearn.preprocessing import scaleʶfrom sklearn.impute import SimpleImputerʶʶʶimport osʶfor dirname, _, filenames in os.walk('/kaggle/input'):ʶ for filename in filenames:ʶ print(os.path.join(dirname, filename))",0009d135ece78d,13,7,6,0.4615,0,0,-1,0.125,0.0
1,c6cd22db,code,df = pd.read_csv('/kaggle/input/breast-cancer-wisconsin-data/data.csv')ʶdf,0009d135ece78d,13,7,6,0.4615,1,1,-1,0.25,0.0833
2,1372ae9b,code,"numerical_data = df.loc[:, ~df.columns.isin(['id', ""diagnosis""])]ʶʶlabels = df[""diagnosis""].factorize(['B','M'])[0]ʶʶheader_labels = pd.DataFrame(data=labels, columns=[""diagnosis""])",0009d135ece78d,13,7,6,0.4615,2,2,-1,0.375,0.1667
3,90ed07ab,code,"def comparison_plot_maker(data_1, data_2, name, column_name_1, column_name_2):ʶ # Scaling Data for testingʶ # data_1 = scale(data_1)ʶ # data_2 = scale(data_2)ʶʶ range = np.random.randn(len(data_1))ʶ plt.scatter(range, data_1, label=column_name_1, color='orange')ʶ plt.scatter(range, data_2, label=column_name_2, color='green')ʶ plt.title(name)ʶ plt.xlabel('X-Axis')ʶ plt.ylabel('Y-Axis')ʶ plt.legend()ʶ plt.show()ʶ",0009d135ece78d,13,7,6,0.4615,3,3,-1,0.5,0.25
4,7f388a41,code,"# Ploting data with different columnsʶ#####################################ʶcomparison_plot_maker(numerical_data[""radius_mean""], numerical_data[""radius_worst""], ""Mean Radius vs Worst Radius"", ""Mean Radius"", ""Worst Radius"")ʶcomparison_plot_maker(numerical_data[""perimeter_se""], numerical_data[""perimeter_worst""], ""S.D Perimeter vs Worst Perimeter"", ""S.D Perimeter"", ""Worst Perimeter"")ʶcomparison_plot_maker(numerical_data[""compactness_mean""], numerical_data[""compactness_se""], ""Mean Compactness vs...",0009d135ece78d,13,7,6,0.4615,4,4,-1,0.625,0.3333
5,2843a25a,code,"# Scaling Dataʶscaler = StandardScaler()ʶscaler.fit(numerical_data)ʶ# print(scaled_data)ʶʶ# Assigning VariablesʶX = scaler.transform(numerical_data)ʶy = labelsʶʶmy_imputer = SimpleImputer()ʶpd.DataFrame(X).fillna(0)ʶX = my_imputer.fit_transform(X)ʶʶprint(""Ignore the errors, they occurred because of NaN values"")ʶprint()ʶprint(""But worry not human! The errors are fixed with Imputer >o>"")ʶprint()",0009d135ece78d,13,7,6,0.4615,5,5,-1,0.75,0.4167
6,06dbf8cf,code,"# 3. Implementing PCA on X (green for benign; red for malignant)ʶ################################################################ʶʶ# PCAʶPCA3=PCA(n_components=2)ʶ# print(X.shape)ʶPCA3.fit(X)ʶXPCA = PCA3.transform(X)ʶ# print(XPCA.shape)ʶʶ# Plottingʶplt.figure()ʶplt.title(""PCA"")ʶplt.xlabel('X-Axis')ʶplt.ylabel('Y-Axis')ʶʶplt.plot(XPCA[y==0,0],XPCA[y==0,1],'g.')ʶplt.plot(XPCA[y==1,0],XPCA[y==1,1],'r.')ʶʶplt.show()",0009d135ece78d,13,7,6,0.4615,6,6,-1,0.875,0.5
7,f9893819,markdown,# Scaling Data ⚖ʶLet's scale the data so PCA can be applied,0009d135ece78d,13,7,6,0.4615,7,-1,0,0.8929,0.5833
8,ba55e576,markdown,## Testing Plots >w>ʶLet's these mystery soliving plots! :O,0009d135ece78d,13,7,6,0.4615,8,-1,1,0.9107,0.6667
9,39e937ec,markdown,"## Plotting PCA 📊ʶThus, the sun boils down to this, the PCA is hence plotted 😮",0009d135ece78d,13,7,6,0.4615,9,-1,2,0.9286,0.75
