In [1]:
# Import pandas
import pandas as pd

In [2]:
# Load train and test datasets
train = pd.read_csv('../CSV/train_set.csv', sep=';')
print(train.shape)

test = pd.read_csv('../CSV/test_set.csv', sep=';')
print(test.shape)

(6074943, 7)
(2444386, 6)


In [3]:
# Print name of columns in datasets 
print(train.columns)
print(test.columns)

Index(['GjaeRecId', 'GroupChartOfAccountsValue', 'PostingLayer',
       'SubledgerVoucherDataAreaId', 'CreatedBy', 'Text', 'Id'],
      dtype='object')
Index(['5637145810', 'A11115B', '0', 'RO01', 'Admin',
       'FXA MIGRATION 31/07/17'],
      dtype='object')


In [4]:
# Give this name columns to test set: 'GjaeRecId', 'GroupChartOfAccountsValue', 'PostingLayer', 'SubledgerVoucherDataAreaId', 'CreatedBy', 'Text'
test.columns = ['GjaeRecId', 'GroupChartOfAccountsValue', 'PostingLayer', 'SubledgerVoucherDataAreaId', 'CreatedBy', 'Text']
train.columns = ['GjaeRecId', 'GroupChartOfAccountsValue', 'PostingLayer', 'SubledgerVoucherDataAreaId', 'CreatedBy', 'Text', 'Activity']

# Print name of columns in datasets
print(test.columns)
print(test.shape)
print(train.columns)
print(train.shape)

Index(['GjaeRecId', 'GroupChartOfAccountsValue', 'PostingLayer',
       'SubledgerVoucherDataAreaId', 'CreatedBy', 'Text'],
      dtype='object')
(2444386, 6)
Index(['GjaeRecId', 'GroupChartOfAccountsValue', 'PostingLayer',
       'SubledgerVoucherDataAreaId', 'CreatedBy', 'Text', 'Activity'],
      dtype='object')
(6074943, 7)


In [5]:
print(train['Activity'].value_counts())
print(train.shape)

Activity
ACT17    600000
ACT44    600000
ACT32    600000
ACT09    600000
ACT15    600000
ACT07    600000
ACT04    600000
ACT03    600000
ACT01    600000
ACT13    561829
ACT34     67410
ACT35     23273
ACT29     20429
ACT16      2002
Name: count, dtype: int64
(6074943, 7)


In [6]:
# Drop rows of train set with Activity = ACT34 or ACT35 or ACT29 or ACT16 
train = train[train.Activity != 'ACT34']
train = train[train.Activity != 'ACT35']
train = train[train.Activity != 'ACT29']
train = train[train.Activity != 'ACT16']

print(train['Activity'].value_counts())
print(train.shape)

Activity
ACT17    600000
ACT44    600000
ACT32    600000
ACT09    600000
ACT15    600000
ACT07    600000
ACT04    600000
ACT03    600000
ACT01    600000
ACT13    561829
Name: count, dtype: int64
(5961829, 7)


In [7]:
# Encode GroupChartOfAccountsValue column to numeric values 
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train['GroupChartOfAccountsValue'] = le.fit_transform(train['GroupChartOfAccountsValue'])
test['GroupChartOfAccountsValue'] = le.fit_transform(test['GroupChartOfAccountsValue'])

# Encode Activity column to numeric values
le = LabelEncoder()
train['Activity'] = le.fit_transform(train['Activity'])

# Encode CreatedBy column to numeric values 
le = LabelEncoder()
train['CreatedBy'] = le.fit_transform(train['CreatedBy'])
test['CreatedBy'] = le.fit_transform(test['CreatedBy'])

# Encode SubledgerVoucherDataAreaId column to numeric values
le = LabelEncoder()
train['SubledgerVoucherDataAreaId'] = le.fit_transform(train['SubledgerVoucherDataAreaId'])
test['SubledgerVoucherDataAreaId'] = le.fit_transform(test['SubledgerVoucherDataAreaId'])

# Print head of train set 
print(train.head())

    GjaeRecId  GroupChartOfAccountsValue  PostingLayer   
0  5649334825                        170             0  \
1  5710479632                        203             0   
2  5647433945                        194             0   
3  5637301235                        276             0   
4  5645359931                        168             2   

   SubledgerVoucherDataAreaId  CreatedBy                             Text   
0                          61        274  Amortissement depuis 31/05/2019  \
1                         106        216                 MALİ YIL AÇILIŞI   
2                         105        237                MALİ YIL KAPANIŞI   
3                          72        244  0100003317 -LYRECO ITALIA S.r.l   
4                          67        147            Depr. since 31/3/2019   

   Activity  
0         7  
1         9  
2         9  
3         8  
4         7  


In [8]:
# Print values of Activity columm
print(train['Activity'].value_counts())

Activity
7    600000
9    600000
8    600000
4    600000
6    600000
3    600000
2    600000
1    600000
0    600000
5    561829
Name: count, dtype: int64


In [9]:
# Data mixing
train = train.sample(frac=1).reset_index(drop=True)

print(train.head())
print(test.head())

    GjaeRecId  GroupChartOfAccountsValue  PostingLayer   
0  5638693444                          1             0  \
1  5645683674                        178             0   
2  5649471234                        194             0   
3  5681568472                        235             0   
4  5642402883                          1             0   

   SubledgerVoucherDataAreaId  CreatedBy   
0                          77        324  \
1                          42         56   
2                          62        324   
3                          63        179   
4                          28         96   

                                                Text  Activity  
0                           allimentation compte mad         5  
1                       31/12/2018 / KUR DEĞERLEMESİ         4  
2                                          PI / 3487         1  
3  Sales invoice - TPUK-INV000002069, Customer - ...         3  
4                                         up 11.2018         

In [10]:
# Remove numbers from the rows in the 'Text' column of the train dataset
train['Text'] = train['Text'].str.replace('\d+', '')

# Remove numbers from the rows in the 'Text' column of the test dataset
test['Text'] = test['Text'].str.replace('\d+', '')

In [11]:
# Remove punctuation and convert text to lowercase in the 'Text' column of 'train'
train['Text'] = train['Text'].str.replace('[^\w\s]','').str.lower()

# Remove punctuation and convert text to lowercase in the 'Text' column of 'test'
test['Text'] = test['Text'].str.replace('[^\w\s]','').str.lower()

In [12]:
# Print text of first 5 rows of train set
print(train['Text'].head())

0                             allimentation compte mad
1                        31/12/2018 / kur değerlemesi̇
2                                            pi / 3487
3    sales invoice - tpuk-inv000002069, customer - ...
4                                           up 11.2018
Name: Text, dtype: object


In [13]:
# Remove GjaeRecId column from test set and save it in a variable
train = train.drop(['GjaeRecId'], axis=1)
GjaeRecId = test['GjaeRecId']
test = test.drop(['GjaeRecId'], axis=1)

print(train.shape)
print(test.shape)

(5961829, 6)
(2444386, 5)


In [14]:
# import re

# def remove_digits(text):
#     if isinstance(text, str):
#         return re.sub(r'\b\d+\b', '', text)
#     else:
#         return text

# # Applica la funzione sulla colonna 'Text' nel dataset di addestramento
# train['Text'] = train['Text'].apply(remove_digits)

# # Applica la funzione sulla colonna 'Text' nel dataset di test
# test['Text'] = test['Text'].apply(remove_digits)


# import string

# # Rimuovi punteggiatura e converti il testo in minuscolo nella colonna 'Text' di 'train'
# train['Text'] = train['Text'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)).lower() if isinstance(x, str) else x)

# # Rimuovi punteggiatura e converti il testo in minuscolo nella colonna 'Text' di 'test'
# test['Text'] = test['Text'].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)).lower() if isinstance(x, str) else x)

In [15]:
# Sostituisci i valori NaN con una stringa vuota nella colonna 'Text' di 'train'
train['Text'] = train['Text'].fillna('')

# Sostituisci i valori NaN con una stringa vuota nella colonna 'Text' di 'test'
test['Text'] = test['Text'].fillna('')


In [16]:
# Create a TF-IDF vectorizer to convert the text of the law into a matrix of TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Fit and transform the TF-IDF vectorizer on the 'Text' column in 'train'
tfidf_train = vectorizer.fit_transform(train['Text'])

# Transform the 'Text' column in 'test'
tfidf_test = vectorizer.transform(test['Text'])


In [17]:
# Split the train dataset into train and validation sets 
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(tfidf_train, train['Activity'], test_size=0.2, random_state=42)

# Import the Multinomial Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB

# Instantiate the MultinomialNB classifier
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(X_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(X_val)


from sklearn import metrics
# Compute and print metrics
print("Accuracy: {}".format(metrics.accuracy_score(y_val, pred)))
print("Classification Report:\n {}".format(metrics.classification_report(y_val, pred)))


# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_val, pred, labels=[0, 1])
print(cm)



Accuracy: 0.8423957073583112
Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.96      0.93    120302
           1       0.82      0.78      0.80    119658
           2       0.99      0.90      0.94    119792
           3       0.91      0.91      0.91    119860
           4       0.73      0.80      0.76    119637
           5       0.84      0.87      0.86    112574
           6       0.87      0.39      0.54    120011
           7       0.61      0.94      0.74    120745
           8       0.93      0.87      0.90    120128
           9       1.00      1.00      1.00    119659

    accuracy                           0.84   1192366
   macro avg       0.86      0.84      0.84   1192366
weighted avg       0.86      0.84      0.84   1192366

[[115415   2072]
 [  6921  93396]]


In [18]:
# Import random forest classifier
from sklearn.ensemble import RandomForestClassifier

# Instantiate the classifier
rfc = RandomForestClassifier(random_state=42, n_jobs = 4)

# Fit to the training data
rfc.fit(X_train, y_train)

# Predict the labels of the test set: y_pred
y_pred = rfc.predict(X_val)

# Compute and print metrics
print("Accuracy: {}".format(metrics.accuracy_score(y_val, y_pred)))
print("Classification Report:\n {}".format(metrics.classification_report(y_val, y_pred)))

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_val, y_pred, labels=[0, 1])
print(cm)

In [None]:
# # Model selection for the logistic regression classifier using GridSearchCV 
# from sklearn.linear_model import LogisticRegression
# from sklearn.model_selection import GridSearchCV

# # Setup the hyperparameter grid
# # c_space = np.logspace(-5, 8, 15)
# param_grid = {'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']}
# logreg = LogisticRegression(random_state=42, max_iter=100, verbose=2)

# # Instantiate the GridSearchCV object
# logreg_cv = GridSearchCV(logreg, param_grid, cv=5)

# # Fit it to the training data
# logreg_cv.fit(X_train, y_train)

# # Print the optimal parameters and best score
# print("Tuned Logistic Regression Parameter: {}".format(logreg_cv.best_params_))
# print("Tuned Logistic Regression Accuracy: {}".format(logreg_cv.best_score_))