In [138]:
# Import libraries for ml classificaiton
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [139]:
# Load train and test datasets
train = pd.read_csv('../CSV/train_set_it01.csv')
test = pd.read_csv('../CSV/test_set_it01.csv', sep=';')

In [140]:
# Check for missing values
print(train.isnull().sum())
print(test.isnull().sum())

# Drop columns with missing values (JournalType, JournalTypeLabel, NumOfLines, CountLedgerJournal)
train = train.drop(['JournalType', 'JournalTypeLabel', 'NumOfLines', 'CountLedgerJournal'], axis=1)
test = test.drop(['JournalType', 'JournalTypeLabel', 'NumOfLines', 'CountLedgerJournal'], axis=1)

GjaeRecId                        0
LedgerDimension                  0
GroupChartOfAccountsValue        0
Ledger                           0
PostingLayer                     0
SubledgerVoucher                 0
CreatedBy                        0
MainAccount                      0
Text                             0
JournalType                  10906
JournalTypeLabel             10906
NumOfLines                   10906
CountLedgerJournal           10906
TransactionCurrencyAmount        0
TransactionCurrencyCode          0
Activity                         0
dtype: int64
GjaeRecId                        0
LedgerDimension                  0
GroupChartOfAccountsValue        0
Ledger                           0
PostingLayer                     0
SubledgerVoucher                 0
CreatedBy                        0
MainAccount                      0
Text                            27
JournalType                  38799
JournalTypeLabel             38799
NumOfLines                   38799
CountLe

In [141]:
# Drop rows where column Text is null in test set
test = test.dropna(subset=['Text'])

In [142]:
# Show the number of observations and features
print(train.shape)
print(test.shape)

(64000, 12)
(38944, 11)


In [143]:
# Check unique values for each column
print(train.nunique())
print(test.nunique())

GjaeRecId                    63959
LedgerDimension               5605
GroupChartOfAccountsValue      162
Ledger                           1
PostingLayer                     3
SubledgerVoucher             40362
CreatedBy                       25
MainAccount                    366
Text                         37375
TransactionCurrencyAmount    34556
TransactionCurrencyCode          1
Activity                         8
dtype: int64
GjaeRecId                    38944
LedgerDimension               4610
GroupChartOfAccountsValue      135
Ledger                           1
PostingLayer                     3
SubledgerVoucher              5418
CreatedBy                        8
MainAccount                    292
Text                          5373
TransactionCurrencyAmount    20526
TransactionCurrencyCode          1
dtype: int64


In [144]:
# Print values of Activity columm
print(train['Activity'].value_counts())

Activity
ACT01    8000
ACT03    8000
ACT07    8000
ACT09    8000
ACT13    8000
ACT15    8000
ACT17    8000
ACT32    8000
Name: count, dtype: int64


In [145]:
# Drop column Ledger and TransactionCurrencyCode
train = train.drop(['GjaeRecId', 'Ledger', 'SubledgerVoucher', 'TransactionCurrencyCode', 'TransactionCurrencyAmount'], axis=1)
test = test.drop(['Ledger', 'SubledgerVoucher', 'TransactionCurrencyCode', 'TransactionCurrencyAmount'], axis=1)
                       

In [146]:
# Show the number of observations and features
print(train.shape)
print(test.shape)
print(train.head())

(64000, 7)
(38944, 7)
   LedgerDimension GroupChartOfAccountsValue  PostingLayer   
0       5637173888                    A10420             0  \
1       5645293726                  E41040IC             0   
2       5637830841                  E41040IC             0   
3       5637175101                    L20200             0   
4       5644324475                    L20200             0   

              CreatedBy  MainAccount   
0  giovanna.santostefan   5637145546  \
1     annamaria.massara   5637145765   
2  giovanna.santostefan   5637145765   
3  giovanna.santostefan   5637145525   
4  giovanna.santostefan   5637145525   

                                                Text Activity  
0               TERRATURISMO - ospitalità AMEX 02/18    ACT01  
1  LEASEPLAN - CANONI SERVIZI AUTO GALLO  GF247CD...    ACT01  
2  LEASEPLAN - Canoni Leasing Auto VILLA EX822AD ...    ACT01  
3                        MANPOWER - interinali 05/18    ACT01  
4                            OSCAR - TAXI 05

In [147]:
# Encode GroupChartOfAccountsValue column to numeric values 
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
train['GroupChartOfAccountsValue'] = le.fit_transform(train['GroupChartOfAccountsValue'])
test['GroupChartOfAccountsValue'] = le.fit_transform(test['GroupChartOfAccountsValue'])

# Encode Activity column to numeric values
le = LabelEncoder()
train['Activity'] = le.fit_transform(train['Activity'])

# Encode CreatedBy column to numeric values 
le = LabelEncoder()
train['CreatedBy'] = le.fit_transform(train['CreatedBy'])
test['CreatedBy'] = le.fit_transform(test['CreatedBy'])

# Encode MainAccount column to numeric values
le = LabelEncoder()
train['MainAccount'] = le.fit_transform(train['MainAccount'])
test['MainAccount'] = le.fit_transform(test['MainAccount'])


print(train.head())


   LedgerDimension  GroupChartOfAccountsValue  PostingLayer  CreatedBy   
0       5637173888                          8             0         12  \
1       5645293726                         58             0          4   
2       5637830841                         58             0         12   
3       5637175101                        125             0         12   
4       5644324475                        125             0         12   

   MainAccount                                               Text  Activity  
0           85               TERRATURISMO - ospitalità AMEX 02/18         0  
1          250  LEASEPLAN - CANONI SERVIZI AUTO GALLO  GF247CD...         0  
2          250  LEASEPLAN - Canoni Leasing Auto VILLA EX822AD ...         0  
3           72                        MANPOWER - interinali 05/18         0  
4           72                            OSCAR - TAXI 05-06/2022         0  


In [148]:
# mescolare i dati
train = train.sample(frac=1).reset_index(drop=True)

print(train.head())
print(test.head())


   LedgerDimension  GroupChartOfAccountsValue  PostingLayer  CreatedBy   
0       5637176289                          1             0         16  \
1       5637176287                          2             0         16   
2       5645331199                        110             0          4   
3       5645331216                        110             0          4   
4       5637176125                         56             0         17   

   MainAccount                            Text  Activity  
0           55             comm. bancarie SOGE         4  
1           30  AMAZON FT 2/446 DEL 12.06.2020         3  
2          293      24009725, Spit fixed asset         6  
3          289      36009208, Spit fixed asset         6  
4          268              Ferie Staff  07/17         7  
    GjaeRecId  LedgerDimension  GroupChartOfAccountsValue  PostingLayer   
0  5741823175       5647352062                         32             0  \
1  5669250005       5642216682                     

In [149]:
# Remove numbers from the rows in the 'Text' column of the train dataset
train['Text'] = train['Text'].str.replace('\d+', '')

# Remove numbers from the rows in the 'Text' column of the test dataset
test['Text'] = test['Text'].str.replace('\d+', '')

In [150]:
# Remove punctuation and convert text to lowercase in the 'Text' column of 'train'
train['Text'] = train['Text'].str.replace('[^\w\s]','').str.lower()

# Remove punctuation and convert text to lowercase in the 'Text' column of 'test'
test['Text'] = test['Text'].str.replace('[^\w\s]','').str.lower()

In [151]:
# Create a TF-IDF vectorizer to convert the text of the law into a matrix of TF-IDF features
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 2))

# Fit and transform the TF-IDF vectorizer on the 'Text' column in 'train'
tfidf_train = vectorizer.fit_transform(train['Text'])

# Transform the 'Text' column in 'test'
tfidf_test = vectorizer.transform(test['Text'])


In [152]:
# Split the train dataset into train and validation sets 
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(tfidf_train, train['Activity'], test_size=0.2, random_state=42)

# Import the Multinomial Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB

# Instantiate the MultinomialNB classifier
nb_classifier = MultinomialNB()

# Fit the classifier to the training data
nb_classifier.fit(X_train, y_train)

# Create the predicted tags: pred
pred = nb_classifier.predict(X_val)


from sklearn import metrics
# Calculate the accuracy score: score
score = metrics.accuracy_score(y_val, pred)
print(score)

# Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_val, pred, labels=[0, 1])
print(cm)



0.81015625
[[1544   36]
 [ 143 1136]]


In [153]:
# Train the model a Neural Network (MLPClassifier)
from sklearn.neural_network import MLPClassifier
mlp = MLPClassifier(hidden_layer_sizes=(100, 50), random_state=42, verbose=True)
mlp.fit(X_train, y_train)

# Predict the labels of the test set: y_pred
y_pred = mlp.predict(X_val)

# Calculate the accuracy score: score
score = metrics.accuracy_score(y_val, y_pred)
print(score)



Iteration 1, loss = 0.96567701
Iteration 2, loss = 0.29933296
Iteration 3, loss = 0.20121142
Iteration 4, loss = 0.17481460
Iteration 5, loss = 0.16225972
Iteration 6, loss = 0.15506094
Iteration 7, loss = 0.14968927
Iteration 8, loss = 0.14615939
Iteration 9, loss = 0.14329345
Iteration 10, loss = 0.14007001
Iteration 11, loss = 0.13840331
Iteration 12, loss = 0.13632185
Iteration 13, loss = 0.13455399
Iteration 14, loss = 0.13335140
Iteration 15, loss = 0.13210263
Iteration 16, loss = 0.13081109
Iteration 17, loss = 0.12999120
Iteration 18, loss = 0.12845974
