## Romanian Poetry Classification

### Data Exploratory

In [1]:
import numpy as np 
import pandas as pd

My data has no header. By default pandas looks in the first row for the header. I have to tell pandas that my data has no header.

In [2]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")


#test = pd.read_csv("test.txt", header = None)
# sample_submission = pd.read_csv("/content/sample_submission.txt")

In [3]:
train

Unnamed: 0,Id,Versuri,Autor
0,gtGAdRcefKLbdSnHeADXAP,Frunzare se boltesc adânci\npeste o-ntreagă po...,Lucian Blaga
1,Vmy4wa4jueqTNxhMwJN8Z4,"<font size=""-1"">Unui gazetar care cerea să fiu...",George Toparceanu
2,Hp7rQvwXLjWcZhSYXDsAJU,Acum te odihneşte gustând eterna pace\nÎn tain...,Vasile Alecsandri
3,FREBhCLk9Urz6YnBzuAe75,"Şi mi-i ghioaga pintuită,\nŞi mi-i inima-ncolţ...",Vasile Alecsandri
4,jXpfLZWB4UXxijS7KxJe9X,"Hohot de smintit.\nNici o urmă despre tine,\n-...",George Bacovia
...,...,...,...
4264,DMCwENrHo94fpyWx6EWYoV,"Lupta urlă, se-ncleştează\nŞi barbarii toţi gr...",Vasile Alecsandri
4265,TMwAUCWTYhFmQLFtmSsHNx,Şi fiindu-şi sie dragă cum nu-i este nime-n lu...,Mihai Eminescu
4266,jRBcajEAaetyxdQ8FxHwDK,\nBătrânul Dan ascultă grăind doi vechi stejar...,Vasile Alecsandri
4267,aiXNgaLRiKhbY4va5uK4gf,"de la oameni la albine,\nde la-nvingători la b...",Lucian Blaga


#### Derived text features, tweet-based (i.e. simple text features)

In [4]:
def add_derived_features(df, text_source):
    #token count
    df['token_cnt'] = df[ text_source ].apply(lambda x: len(str(x).split(' ')) if ((x is not None) and (x!=np.nan)) else 0)

    # character count
    df['line_size'] = df[ text_source ].apply(lambda x: len(str(x)         )   )  

    # non empty characters
    df['empty_char_cnt'] = df[ text_source ].apply(lambda x: len([c for c in str(x) if c==' '])   )

    # non empty characters
    df['non_empty_char_cnt'] = df[ text_source ].apply(lambda x: len([c for c in str(x) if c!=' '])   )

    #letters only
    df['letter_cnt'] = df[ text_source ].apply(lambda x: len([c for c in str(x) if c.isalpha()])   )

    #special characters only
    df['special_char_cnt'] = df[ text_source ].apply(lambda x: len([c for c in str(x) if (not c.isalnum()) ])   )

    #digits only
    df['digits_cnt'] = df[ text_source ].apply(lambda x: len([c for c in str(x) if c.isnumeric()])   )

    #relative counts
    # non empty characters
    df['empty_char_prct'] = df['empty_char_cnt'] /df['line_size']

    # non empty characters
    df['non_empty_char_prct'] = df['non_empty_char_cnt']/df['line_size']

    #letters only
    df['letter_prct'] = df['letter_cnt'] /df['line_size']

    #special characters only
    df['special_char_prct'] = df['special_char_cnt'] /df['line_size']

    #digits only
    df['digits_prct'] = df['digits_cnt']/df['line_size']


    # catergories absolute count
    df['is_alpha_cnt'] =df[ text_source ].apply(lambda x: len([t for t in str(x).split(' ') if t.isalpha() ] ))
    df['is_mixed_alnum_cnt'] =df[ text_source ].apply(lambda x: len([t for t in str(x).split(' ') if t.isalnum() and\
                                                                     (not t.isalpha()) and (not t.isnumeric())] ))
    df['is_numeric_cnt'] =df[ text_source ].apply(lambda x: len([t for t in str(x).split(' ') if t.isnumeric() ] ))
    df['is_non_alnum_cnt'] =df[ text_source ].apply(lambda x: len( str(x).split(' ')  )-\
                                            len([t for t in str(x).split(' ') if t.isalnum() ] ))
    # categories relative count
    df['is_alpha_prct'] =df['is_alpha_cnt']/df['token_cnt']
    df['is_numeric_prct'] =df['is_numeric_cnt']/df['token_cnt']
    df['is_mixed_alnum_prct'] =df['is_mixed_alnum_cnt']/df['token_cnt']
    df['is_non_alnum_prct'] =df['is_non_alnum_cnt']/df['token_cnt']

    for c in ['is_alpha_prct','is_numeric_prct','is_mixed_alnum_prct','is_non_alnum_prct']:
        df[c] = df[c].replace(np.inf, 2)
        df[c] = df[c].replace(np.nan, -1)
    # max and min token length by category; mean and std for lengths

    # max
    df['is_alpha_max_len'] =df[ text_source ].apply(lambda x: max([len(t) for t in str(x).split(' ') if t.isalpha() ] ) if\
                                                len([t for t in str(x).split(' ') if t.isalpha()])>0 else -1)
    df['is_numeric_max_len'] =df[ text_source ].apply(lambda x: max([len(t) for t in str(x).split(' ') if t.isnumeric() ] )if\
                                                len([t for t in str(x).split(' ') if t.isnumeric()])>0 else -1)
    df['is_mixed_alnum_max_len'] =df[ text_source ].apply(lambda x: max([len(t) for t in str(x).split(' ') if t.isalnum() and\
                                                                     (not t.isalpha()) and (not t.isnumeric())] )if\
                                                len([t for t in str(x).split(' ') if t.isalnum() and\
                                                                     (not t.isalpha()) and (not t.isnumeric())])>0 else -1)

    df['is_non_alnum_max_len'] =df[ text_source ].apply(lambda x: max([len(t) for t in str(x).split(' ') if (not t.isalnum())] )if\
                                                len([t for t in str(x).split(' ') if (not t.isalnum())])>0 else -1)

    # min
    df['is_alpha_min_len'] =df[ text_source ].apply(lambda x: min([len(t) for t in str(x).split(' ') if t.isalpha() ] ) if\
                                                len([t for t in str(x).split(' ') if t.isalpha()])>0 else -1)
    df['is_numeric_min_len'] =df[ text_source ].apply(lambda x: min([len(t) for t in str(x).split(' ') if t.isnumeric() ] )if\
                                                len([t for t in str(x).split(' ') if t.isnumeric()])>0 else -1)
    df['is_mixed_alnum_min_len'] =df[ text_source ].apply(lambda x: min([len(t) for t in str(x).split(' ') if t.isalnum() and\
                                                                     (not t.isalpha()) and (not t.isnumeric())] )if\
                                                len([t for t in str(x).split(' ') if t.isalnum() and\
                                                                     (not t.isalpha()) and (not t.isnumeric())])>0 else -1)

    df['is_non_alnum_min_len'] =df[ text_source ].apply(lambda x: min([len(t) for t in str(x).split(' ') if (not t.isalnum())] )if\
                                                len([t for t in str(x).split(' ') if (not t.isalnum())])>0 else -1)

    # avg len
    df['is_alpha_avg_len'] =df[ text_source ].apply(lambda x: np.mean([len(t) for t in str(x).split(' ') if t.isalpha() ] ) if\
                                                len([t for t in str(x).split(' ') if t.isalpha()])>0 else -1)
    df['is_numeric_avg_len'] =df[ text_source ].apply(lambda x: np.mean([len(t) for t in str(x).split(' ') if t.isnumeric() ] )if\
                                                len([t for t in str(x).split(' ') if t.isnumeric()])>0 else -1)
    df['is_mixed_alnum_avg_len'] =df[ text_source ].apply(lambda x: np.mean([len(t) for t in str(x).split(' ') if t.isalnum() and\
                                                                     (not t.isalpha()) and (not t.isnumeric())] )if\
                                                len([t for t in str(x).split(' ') if t.isalnum() and\
                                                                     (not t.isalpha()) and (not t.isnumeric())])>0 else -1)

    df['is_non_alnum_avg_len'] =df[ text_source ].apply(lambda x: np.mean([len(t) for t in str(x).split(' ') if (not t.isalnum())] )if\
                                                len([t for t in str(x).split(' ') if (not t.isalnum())])>0 else -1)

    # std for len
    df['is_alpha_std_len'] =df[ text_source ].apply(lambda x: np.std([len(t) for t in str(x).split(' ') if t.isalpha() ] ) if\
                                                len([t for t in str(x).split(' ') if t.isalpha()])>0 else -1)
    df['is_numeric_std_len'] =df[ text_source ].apply(lambda x: np.std([len(t) for t in str(x).split(' ') if t.isnumeric() ] )if\
                                                len([t for t in str(x).split(' ') if t.isnumeric()])>0 else -1)
    df['is_mixed_alnum_std_len'] =df[ text_source ].apply(lambda x: np.std([len(t) for t in str(x).split(' ') if t.isalnum() and\
                                                                     (not t.isalpha()) and (not t.isnumeric())] )if\
                                                len([t for t in str(x).split(' ') if t.isalnum() and\
                                                                     (not t.isalpha()) and (not t.isnumeric())])>0 else -1)

    df['is_non_alnum_std_len'] =df[ text_source ].apply(lambda x: np.std([len(t) for t in str(x).split(' ') if (not t.isalnum())] )if\
                                                len([t for t in str(x).split(' ') if (not t.isalnum())])>0 else -1)

add_derived_features(train, 'Versuri')

In [5]:
train.head()

Unnamed: 0,Id,Versuri,Autor,token_cnt,line_size,empty_char_cnt,non_empty_char_cnt,letter_cnt,special_char_cnt,digits_cnt,...,is_mixed_alnum_min_len,is_non_alnum_min_len,is_alpha_avg_len,is_numeric_avg_len,is_mixed_alnum_avg_len,is_non_alnum_avg_len,is_alpha_std_len,is_numeric_std_len,is_mixed_alnum_std_len,is_non_alnum_std_len
0,gtGAdRcefKLbdSnHeADXAP,Frunzare se boltesc adânci\npeste o-ntreagă po...,Lucian Blaga,18,128,17,111,105,23,0,...,-1,9,4.642857,-1.0,-1,11.5,1.98592,-1.0,-1,1.802776
1,Vmy4wa4jueqTNxhMwJN8Z4,"<font size=""-1"">Unui gazetar care cerea să fiu...",George Toparceanu,21,155,20,135,114,40,1,...,-1,5,3.615385,-1.0,-1,11.0,1.982169,-1.0,-1,5.545268
2,Hp7rQvwXLjWcZhSYXDsAJU,Acum te odihneşte gustând eterna pace\nÎn tain...,Vasile Alecsandri,25,167,24,143,131,36,0,...,-1,3,4.176471,-1.0,-1,9.0,2.617482,-1.0,-1,3.535534
3,FREBhCLk9Urz6YnBzuAe75,"Şi mi-i ghioaga pintuită,\nŞi mi-i inima-ncolţ...",Vasile Alecsandri,12,102,11,91,78,24,0,...,-1,4,3.833333,-1.0,-1,11.333333,1.950783,-1.0,-1,7.086764
4,jXpfLZWB4UXxijS7KxJe9X,"Hohot de smintit.\nNici o urmă despre tine,\n-...",George Bacovia,12,86,11,75,60,26,0,...,-1,5,4.0,-1.0,-1,9.4,2.329929,-1.0,-1,2.727636


In [6]:
train.shape

(4269, 39)

In [7]:
labels = train['Autor'].unique().tolist()
train['Autor'] = train['Autor'].apply(lambda x:labels.index(x))
train.head()

Unnamed: 0,Id,Versuri,Autor,token_cnt,line_size,empty_char_cnt,non_empty_char_cnt,letter_cnt,special_char_cnt,digits_cnt,...,is_mixed_alnum_min_len,is_non_alnum_min_len,is_alpha_avg_len,is_numeric_avg_len,is_mixed_alnum_avg_len,is_non_alnum_avg_len,is_alpha_std_len,is_numeric_std_len,is_mixed_alnum_std_len,is_non_alnum_std_len
0,gtGAdRcefKLbdSnHeADXAP,Frunzare se boltesc adânci\npeste o-ntreagă po...,0,18,128,17,111,105,23,0,...,-1,9,4.642857,-1.0,-1,11.5,1.98592,-1.0,-1,1.802776
1,Vmy4wa4jueqTNxhMwJN8Z4,"<font size=""-1"">Unui gazetar care cerea să fiu...",1,21,155,20,135,114,40,1,...,-1,5,3.615385,-1.0,-1,11.0,1.982169,-1.0,-1,5.545268
2,Hp7rQvwXLjWcZhSYXDsAJU,Acum te odihneşte gustând eterna pace\nÎn tain...,2,25,167,24,143,131,36,0,...,-1,3,4.176471,-1.0,-1,9.0,2.617482,-1.0,-1,3.535534
3,FREBhCLk9Urz6YnBzuAe75,"Şi mi-i ghioaga pintuită,\nŞi mi-i inima-ncolţ...",2,12,102,11,91,78,24,0,...,-1,4,3.833333,-1.0,-1,11.333333,1.950783,-1.0,-1,7.086764
4,jXpfLZWB4UXxijS7KxJe9X,"Hohot de smintit.\nNici o urmă despre tine,\n-...",3,12,86,11,75,60,26,0,...,-1,5,4.0,-1.0,-1,9.4,2.329929,-1.0,-1,2.727636


In [8]:
train_cols = [c for c in train.columns if c not in ['Id','Autor']]

### Data Modelling 

Split multiple times in train and test
For now I split once, setting the random_state so that if I want to work later everything is traceable.

Once the model is found, I repeat this split and the training say 100 times, with random_state eliminated so that the splits are independent and not repeatable.

In [9]:
from sklearn.model_selection import train_test_split



X_train, X_test, y_train, y_test = train_test_split(train[train_cols], train[['Autor']], test_size=0.2, 
                                                    random_state=7) # this will have to stay loose if we want to properly
                                                                     # evaluate the model performance and split several time
print(train.shape[1], X_train.shape[1],X_test.shape[1])

39 37 37


#### Part of feature engineering I need to add text features

In [10]:
# !pip install nltk
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Alexa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
from nltk.corpus import stopwords
german_stop_words = stopwords.words('romanian')
import nltk


#from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features =5000, # max words for 'vocabulary' - to run this eg. 
                             max_df = 0.3, # max frequency for words in vocabulary 
                             min_df = 0.001, # min frequency for words in vocabulary
                             token_pattern  = '[a-zA-Z]{3,20}', #just look for words at least 3 char long and no more than 20
                             smooth_idf = True, 
                             #binary = True,
                             stop_words = german_stop_words # german stop words loaded - not sure how efficient this is but worth trying
                                  )

# applies the fit_transform on the training set and the transform on the test set
def add_text_features(df_train,df_test):

    X_train_tfidf = vectorizer.fit_transform(df_train['Versuri'])
     
    print("Train Vectorized Shape: ", X_train_tfidf.shape)

    X_test_tfidf = vectorizer.transform(df_test['Versuri'])
    print("Test Vectorized Shape: ", X_test_tfidf.shape)
    
    #### Adding sparse data series from vectorization as columns in dataframe
    for i, col in enumerate(vectorizer.get_feature_names()):
        
        #x = pd.Series(pd.array.Sparse(X_train_tfidf[:, i].toarray().ravel()))
        if col in df_train.columns:
            df_train.drop(columns = [col],inplace = True)
        if col in df_test.columns:
            df_test.drop(columns = [col],inplace = True)
        df_train.loc[:,col] = X_train_tfidf[:, i].toarray()
        df_test[col] = X_test_tfidf[:, i].toarray()

This is the final thing to be used - but first cross validation; inside the cross validation need to add features to splits
the same need to be done when training on this below...


In [12]:
X = X_train[[c for c in X_train.columns if c !='Versuri']].to_numpy()
y= y_train.to_numpy()

X_eval = X_test[[c for c in X_test.columns if c !='Versuri']].to_numpy()
y_eval = y_test.to_numpy()

### Fitting Random Forest Regression to the dataset

Overfitting is controlled by max_samples.

I can play with the word freq and max samples to tune the algorithm. the estimators might not play a big role, although when max_samples is small, than up to some point we want to add estimators in order to explore the entire data set with as many disjoint opinions (from estimators for which the training set is not overlapping)

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [14]:
from sklearn.ensemble import RandomForestRegressor
regressor = RandomForestClassifier(n_estimators =10000, 
                                  criterion = 'gini',
                                  max_samples =0.1,
                                  n_jobs = -1,
                                  random_state = 0)

In [15]:
import copy

# make a hard copy of the X_train to avoid messing up with the cross-validation if a cross-validation was used
X_train_orig = copy.deepcopy(X_train)
X_test_orig = copy.deepcopy(X_test)

# add text features (vectorization) - that is prepare the train and test features
add_text_features(X_train_orig,X_test_orig)

# convert to numpy arrays
X = X_train_orig[[c for c in X_train_orig.columns if c !='Versuri']].to_numpy()
y= y_train.to_numpy()

X_eval = X_test_orig[[c for c in X_test_orig.columns if c !='Versuri']].to_numpy()
y_eval = y_test.to_numpy()



Train Vectorized Shape:  (3415, 1994)
Test Vectorized Shape:  (854, 1994)


In [16]:
%%time
# set up the regressor and fit on big training set
regressor.max_features =int(round(X.shape[1]*0.33,0))
regressor.fit(X,y)

# predict on train and test
# y_hat_fit = regressor.predict(X)
y_hat_eval = regressor.predict(X_eval)



Wall time: 39.2 s


In [17]:
from sklearn import metrics
print(metrics.classification_report(y_hat_eval, y_eval))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.01      0.20      0.02         5
           2       0.38      0.44      0.40       124
           3       0.00      0.00      0.00         1
           4       0.77      0.40      0.53       463
           5       0.60      0.51      0.55       219
           6       0.13      0.28      0.18        39

    accuracy                           0.43       854
   macro avg       0.27      0.26      0.24       854
weighted avg       0.63      0.43      0.49       854



### Submission part 

Since there is a submission sample, I will try to respect the format

In [18]:
test.head()

Unnamed: 0,Id,Versuri
0,asPpWqsLJfmiX8BiEf3gmm,"Sus, pe dealuri, Toamna pune\nMirişti galbene-..."
1,oJWxeVPj4QVvqeaqYcuQJK,"Te sărut şi eu şi Luna,\nIzvoraşule.\nCă eşti ..."
2,VfrePwDMD4cCausKZ5X5cf,"Ah! viaţa pentru mine,\nScump înger! fără tine..."
3,857QzB65ijZQGaoSjEaEnn,"Cum aş zidi un val,\nA doua zi iar,\nA treia z..."
4,cZSmeqnMgw56oadzt3EfAz,"Cu creionul dus la gură,\nNecăjit fără măsură,..."


In [19]:
ids = test.Id
test = test[['Versuri']]

Also need to apply the same feature engineering to the real unseen data. 

In [20]:
add_derived_features(test, 'Versuri')

In [21]:
X_test_tfidf_real = vectorizer.transform(test['Versuri'])
print("Test Vectorized Shape: ", X_test_tfidf_real.shape)

Test Vectorized Shape:  (1068, 1994)


In [22]:
for i, col in enumerate(vectorizer.get_feature_names()):
        
        if col in test.columns:
            test.drop(columns = [col],inplace = True)
        test[col] = X_test_tfidf_real[:, i].toarray()

In [23]:
test.head(5)

Unnamed: 0,Versuri,token_cnt,line_size,empty_char_cnt,non_empty_char_cnt,letter_cnt,special_char_cnt,digits_cnt,empty_char_prct,non_empty_char_prct,...,zmeu,zne,znesc,zori,zorii,zugr,zui,zut,zute,zvon
0,"Sus, pe dealuri, Toamna pune\nMirişti galbene-...",13,98,12,86,78,20,0,0.122449,0.877551,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"Te sărut şi eu şi Luna,\nIzvoraşule.\nCă eşti ...",11,75,10,65,58,17,0,0.133333,0.866667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,"Ah! viaţa pentru mine,\nScump înger! fără tine...",15,86,14,72,64,22,0,0.162791,0.837209,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,"Cum aş zidi un val,\nA doua zi iar,\nA treia z...",14,66,13,53,46,20,0,0.19697,0.80303,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,"Cu creionul dus la gură,\nNecăjit fără măsură,...",15,92,14,78,72,20,0,0.152174,0.847826,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
X_eval_real.shape

NameError: name 'X_eval_real' is not defined

In [None]:
train.shape

In [None]:
test.shape

In [None]:
X_eval_real = test[[c for c in test.columns if c !='Versuri']].to_numpy()
X_eval_real.shape

In [None]:
X_test_tfidf_real.shape

In [None]:
y_hat_eval_real = regressor.predict(X_eval_real)

In [None]:
y_hat_eval_real

In [None]:
submission = pd.DataFrame()

In [None]:
submission['Autor'] = y_hat_eval_real

In [None]:
submission['Autor'] = submission['Autor'].apply(lambda x: labels[x])
submission.head()

In [None]:
test

In [None]:
submission['Id'] = ids

In [None]:
submission.to_csv('submission.csv', index=False)
submission.head()