# Create train test sample
for providers and us to test our final results.

 * [Dev variables](#Dev-variables)
 * [Load data](#Load-data)
 * [Prepare data](#Prepare-data)
 * [Create sample](#Create-sample)
 * [Export files](#Export-files)

## Dev variables

In [1]:
# data_root_folder='H:/AI_for_Selection/'
data_root_folder='/media/AIDrive/'


## Load data

In [2]:
import os
import sys
import numpy as np
import pandas as pd
import gensim
import spacy
import nltk

import logging
from gensim.summarization import summarize

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

pd.set_option('display.max_colwidth', -1)

In [3]:

folder_path_a = data_root_folder + 'a/'
folder_path_websites = data_root_folder + '/Websites/'
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 

In [4]:
#Metadata file is the supplied by MA to us
metadata_file =  data_root_folder + 'objective_files_with_acronyms LATEST.xlsx'
metadata_df = pd.read_excel(open(metadata_file,'rb'), sheet_name='objective_files_with_acronyms', nrows=118677)

In [5]:
len(metadata_df)

118677

## Prepare data

In [6]:
metadata_df.columns

Index(['Unnamed: 0', 'documentid', 'objectivefileid', 'fileextension',
       'versionnumber', 'disposal_schedule', 'repository', 'datelastmodified',
       'parent11', 'parent10', 'parent9', 'parent8', 'parent7', 'parent6',
       'parent5', 'parent4', 'parent3', 'parent2', 'parent1', 'objective3',
       'objective2', 'objective1', 'originalname', 'documentname',
       'copyflatlines',
       'To the left there is the full data from the EDRMS.\nTo the right is the data with the acronyms',
       'trim_11', 'trim_10', 'trim_9', 'trim_8', 'trim_7', 'trim_6', 'trim_5',
       'trim_4', 'trim_3', 'trim_2', 'trim_1'],
      dtype='object')

In [7]:
metadata_df = metadata_df.drop(['Unnamed: 0','copyflatlines',
       'To the left there is the full data from the EDRMS.\nTo the right is the data with the acronyms'], axis = 1)

In [8]:
metadata_df.head(5)

Unnamed: 0,documentid,objectivefileid,fileextension,versionnumber,disposal_schedule,repository,datelastmodified,parent11,parent10,parent9,...,trim_10,trim_9,trim_8,trim_7,trim_6,trim_5,trim_4,trim_3,trim_2,trim_1
0,A3109716,qA35736,xls,2,24 Projects - Full Projects (Close file when Project ends),Strategic Projects,00:42:31,"Procurement, Project Delivery and Contract Management",Project Delivery,Projects - Closed,...,PD,PC_1,20YRaCS,GE_1,2012RTR,RTRS2012DS,no name,no name,no name,no name
1,A3133123,qA35736,xls,1,24 Projects - Full Projects (Close file when Project ends),Strategic Projects,00:55:08,"Procurement, Project Delivery and Contract Management",Project Delivery,Projects - Closed,...,PD,PC_1,20YRaCS,GE_1,2012RTR,RTRS2012DS,no name,no name,no name,no name
2,A3097046,qA35736,xls,4,24 Projects - Full Projects (Close file when Project ends),Strategic Projects,00:54:09,"Procurement, Project Delivery and Contract Management",Project Delivery,Projects - Closed,...,PD,PC_1,20YRaCS,GE_1,2012RTR,RTRS2012DS,no name,no name,no name,no name
3,A3113792,qA35736,xls,2,24 Projects - Full Projects (Close file when Project ends),Strategic Projects,00:52:54,"Procurement, Project Delivery and Contract Management",Project Delivery,Projects - Closed,...,PD,PC_1,20YRaCS,GE_1,2012RTR,RTRS2012DS,no name,no name,no name,no name
4,A3115138,qA35736,xls,2,24 Projects - Full Projects (Close file when Project ends),Strategic Projects,00:40:05,"Procurement, Project Delivery and Contract Management",Project Delivery,Projects - Closed,...,PD,PC_1,20YRaCS,GE_1,2012RTR,RTRS2012DS,no name,no name,no name,no name


In [9]:
print("check there are as many documents as unique documentid: %r, \n unique document ids: %i" % ((len(metadata_df.documentid.unique()) == len(metadata_df)),len(metadata_df.documentid.unique())))

check there are as many documents as unique documentid: True, 
 unique document ids: 118677


In [10]:
metadata_df['ret_schedule']= metadata_df['disposal_schedule'].apply(lambda x: x.split()[0] )

## Create sample

In [11]:
X_train, X_test, y_train, y_test = train_test_split(metadata_df[['documentid']],metadata_df[['ret_schedule']],random_state=0,test_size=0.2,stratify=metadata_df[['ret_schedule']])

In [12]:
train_df=pd.DataFrame(data=X_train,columns=['documentid'],index=X_train.index)
train_df['ret_schedule']=y_train

test_df=pd.DataFrame(data=X_test,columns=['documentid'],index=X_test.index)
test_df['ret_schedule']=y_test

### Check data validity

In [13]:
documentid=train_df.iloc[42]['documentid']
print(train_df[train_df.documentid==documentid])
print(metadata_df[['documentid','ret_schedule']][metadata_df.documentid==documentid])

       documentid ret_schedule
117643  A2762809   24         
       documentid ret_schedule
117643  A2762809   24         


In [14]:
documentid=test_df.iloc[42]['documentid']
print(test_df[test_df.documentid==documentid])
print(metadata_df[['documentid','ret_schedule']][metadata_df.documentid==documentid])

       documentid ret_schedule
114650  A3094893   21         
       documentid ret_schedule
114650  A3094893   21         


In [15]:
print("percentage of train set of total set: %1.2f, train amount: %i, test amount: %i" % (len(train_df)/(len(train_df)+len(test_df)),len(train_df),len(test_df)))


train_df_selected=train_df[train_df.ret_schedule.isin(['04', '06','15b','17','21','33','34','35','36'])]
test_df_selected=test_df[test_df.ret_schedule.isin(['04', '06','15b','17','21','33','34','35','36'])]
percentage=len(train_df_selected)/(len(train_df_selected)+len(test_df_selected))
print("percentage of selected train set of total set: %1.2f, train amount: %i, test amount: %i" % (percentage,len(train_df_selected),len(test_df_selected)))

percentage of train set of total set: 0.80, train amount: 94941, test amount: 23736
percentage of selected train set of total set: 0.80, train amount: 18620, test amount: 4655


## Export files

In [16]:
import os

# define the name of the directory to be created
path = "bin"

try:
    os.mkdir(path)
except OSError as e:
    print ("Creation of the directory %s failed for reason: %s" % (path, e.strerror))
else:
    print ("Successfully created the directory %s " % path)


Creation of the directory bin failed for reason: File exists


In [17]:
train_df.to_csv(path_or_buf="bin/sample_train.csv.gz",)
test_df.to_csv(path_or_buf="bin/sample_test.csv.gz")