In [111]:
import pandas as pd
import re
import unicodedata
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import joblib
# nltk.download('stopwords')
# nltk.download('wordnet')

import warnings
warnings.filterwarnings('ignore')

In [112]:
# Load the training data; the preview in the next cell shows `lang_id` and `text` columns.
df_train = pd.read_csv("data/cleaned_data.csv")

In [113]:
# Preview the first rows to check the column names and the text format.
df_train.head()

Unnamed: 0,lang_id,text
0,xho,umgaqosiseko wenza amalungiselelo kumaziko axh...
1,xho,dha iya kuba nobulumko bokubeka umsebenzi naph...
2,eng,province kwazulunatal department transport inv...
3,nso,netefatša gore file dilo moka tše dumelelanego...
4,ven,khomishini ndinganyiso mbeu ewa maana nga mula...


In [128]:
# Features are the raw text strings and the target is the language code,
# both taken from `df_train`.
X = df_train['text']
y = df_train['lang_id']

In [129]:
# Split the dataset into training and testing sets
# 20% of the rows are held out; random_state=42 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [130]:
# Chain feature extraction and the classifier so they are fitted together.
# The step names ('tfidf', 'svm') are the prefixes used by the keys in `param_grid`.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),  # Convert text to TF-IDF features
    ('svm', SVC())  # Support Vector Classifier
])

In [131]:
# Search space for the grid search. Keys follow the '<step name>__<parameter>'
# convention so each value list is routed to the matching pipeline step.
# 3 x 3 x 2 = 18 parameter combinations in total.
param_grid = {
    'tfidf__max_features': [1000, 5000, 10000],  # Maximum number of features in TF-IDF
    'svm__C': [0.1, 1, 10],  # Regularization parameter
    'svm__kernel': ['linear', 'rbf']  # Kernel type
}

In [132]:
# Try every combination in `param_grid` with 5-fold cross-validation,
# using only the training split so the held-out split stays unseen.
grid_search = GridSearchCV(pipeline, param_grid, cv=5)
grid_search.fit(X_train, y_train)

In [133]:
# Score the fitted grid search on the held-out split.
# NOTE(review): with default GridSearchCV settings this should be the accuracy of the
# refitted best estimator — confirm if `scoring`/`refit` are ever changed.
accuracy = grid_search.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.9931818181818182


In [125]:
# NOTE(review): GridSearchCV and Pipeline are first used in cells that appear ABOVE this one.
# This cell's execution count (125) shows it was run before them, so on a fresh
# top-to-bottom run those cells fail unless these imports live in the first cell.
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [134]:
# Read back the parameter combination selected by the grid search.
best_params = grid_search.best_params_
print("Best parameters:", best_params)

Best parameters: {'svm__C': 10, 'svm__kernel': 'rbf', 'tfidf__max_features': 10000}


In [136]:
# Winning hyper-parameters, pinned as literals (same values the grid search printed).
best_params = {'svm__C': 10, 'svm__kernel': 'rbf', 'tfidf__max_features': 10000}

# Start from a default TF-IDF -> SVC pipeline, then apply the tuned values
# through their step-prefixed names.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('svm', SVC()),
])
pipeline.set_params(**best_params)

# Train on the training split only.
pipeline.fit(X_train, y_train)

# Predict labels for the held-out split.
y_pred_test = pipeline.predict(X_test)

In [137]:
# Evaluate the model's performance
# Per-language precision / recall / F1 on the held-out split.
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       583
         eng       0.99      1.00      1.00       615
         nbl       0.98      0.98      0.98       583
         nso       1.00      1.00      1.00       625
         sot       1.00      1.00      1.00       618
         ssw       1.00      0.99      1.00       584
         tsn       1.00      1.00      1.00       598
         tso       1.00      1.00      1.00       561
         ven       1.00      1.00      1.00       634
         xho       0.99      0.99      0.99       609
         zul       0.97      0.98      0.98       590

    accuracy                           0.99      6600
   macro avg       0.99      0.99      0.99      6600
weighted avg       0.99      0.99      0.99      6600



### Predict on the unlabelled test data and build the submission

In [138]:
# Load the test data from the pre-cleaned CSV.
# This cell only reads the file; no cleaning or transformation happens here.
df_test = pd.read_csv("data/cleaned_test_data.csv")

In [139]:
# Preview the test rows: the output shows an `index` id column and `text`, with no label column.
df_test.head()

Unnamed: 0,index,text
0,1,Mmasepala maemo kgethegileng letlelela kgato
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana ngano dza vhathu
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste buitelandse valuta


In [140]:
# Convert text data into numerical representation.
# Reuse the TF-IDF step fitted inside `pipeline`, so the test features share the
# vocabulary and IDF weights learned on the training split. The previous code called
# a standalone `vectorizer` that is not defined in any visible cell.
X_test_final = pipeline.named_steps['tfidf'].transform(df_test['text'])

In [141]:
# Make predictions on the test data.
# Feed the raw text to the fitted TF-IDF + SVC `pipeline`, which vectorizes and
# classifies in one call. The previous code used a `model` that is not defined in
# any visible cell, so it failed (or used a stale object) on a fresh kernel.
y_pred_test = pipeline.predict(df_test['text'])


In [142]:
# Load the sample submission to see the expected output layout
# (the preview shows `index` and `lang_id` columns).
df_sample = pd.read_csv("data/sample_submission.csv")
df_sample.head()

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl


In [146]:
# Build the submission from the ids that ship with the test file plus the predictions.
# `df_test` already has an `index` column (see its preview), so it is used as-is.
# Overwriting it with positional values would mutate `df_test` and silently produce
# wrong ids if the file's ids were ever non-contiguous.
# `y_pred_test` is matched to the rows by position.
submission_df = df_test[['index']].assign(lang_id=y_pred_test)

# Save the submission DataFrame to a CSV file
submission_df.to_csv('data/submission_df.csv', index=False)

In [147]:
# Display the submission frame (pandas truncates the middle rows) as a final sanity check.
submission_df

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,nbl
...,...,...
5677,5678,eng
5678,5679,nso
5679,5680,sot
5680,5681,sot


In [149]:
# Save the trained model.
# Persist the whole fitted `pipeline` (TF-IDF vectorizer + SVC) so the file can
# classify raw text after loading. The previous code dumped a `model` that is not
# defined in any visible cell.
joblib.dump(pipeline, 'data/trained_SVC_model.pkl')

['data/trained_SVC_model.pkl']

### Logistic Regression

In [151]:
from sklearn.linear_model import LogisticRegression


In [154]:
# Set the hyperparameters for the logistic regression model
hyperparameters = {
    'logistic__C': 1.0,
    'logistic__penalty': 'l2',
    'tfidf__max_features': 5000
}

# Create a pipeline with the logistic regression model
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=hyperparameters['tfidf__max_features'])),
    ('logistic', LogisticRegression(C=hyperparameters['logistic__C'], penalty=hyperparameters['logistic__penalty']))
])

# Coerce every document to `str` so non-string entries cannot break the vectorizer.
# The results go into NEW names: rebinding X_train / X_test would overwrite the
# split variables shared with the SVC section and make this cell change upstream state.
X_train_str = [str(x) for x in X_train]
X_test_str = [str(x) for x in X_test]

# Fit the pipeline on the training data
pipeline.fit(X_train_str, y_train)

# Make predictions on the held-out split
y_pred_test = pipeline.predict(X_test_str)

In [155]:
# Evaluate the model's performance
# Per-language precision / recall / F1 of the logistic-regression predictions on the held-out split.
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

         afr       1.00      1.00      1.00       583
         eng       0.99      1.00      0.99       615
         nbl       0.97      0.98      0.98       583
         nso       1.00      1.00      1.00       625
         sot       1.00      1.00      1.00       618
         ssw       0.99      0.98      0.98       584
         tsn       1.00      1.00      1.00       598
         tso       1.00      1.00      1.00       561
         ven       1.00      1.00      1.00       634
         xho       0.98      0.98      0.98       609
         zul       0.96      0.96      0.96       590

    accuracy                           0.99      6600
   macro avg       0.99      0.99      0.99      6600
weighted avg       0.99      0.99      0.99      6600



In [156]:
from sklearn.metrics import confusion_matrix

# Calculate the confusion matrix
# Rows are true labels and columns are predicted labels (scikit-learn convention).
# NOTE(review): label order is presumably the sorted label order shown in the
# classification report (afr, eng, nbl, ...) — confirm before reading off pairs.
cm = confusion_matrix(y_test, y_pred_test)

# Display the confusion matrix
print("Confusion Matrix:")
print(cm)

Confusion Matrix:
[[582   1   0   0   0   0   0   0   0   0   0]
 [  0 615   0   0   0   0   0   0   0   0   0]
 [  1   1 570   0   0   0   0   0   0   4   7]
 [  0   2   0 622   0   0   1   0   0   0   0]
 [  0   0   0   1 616   0   1   0   0   0   0]
 [  0   2   2   0   0 572   0   0   0   0   8]
 [  0   1   0   0   1   0 596   0   0   0   0]
 [  0   1   0   0   0   0   0 560   0   0   0]
 [  0   0   0   0   0   0   0   0 634   0   0]
 [  0   1   3   0   1   1   0   0   0 597   6]
 [  0   0  11   0   0   5   0   0   0   6 568]]


### Predict on the unlabelled test data with Logistic Regression

In [157]:
# Make predictions on the test data with the logistic-regression `pipeline` fitted above.
# The previous code called `model.predict(X_test_final)`, which has two problems:
# `model` is not the logistic pipeline, and `X_test_final` was vectorized for a
# different feature space. The raw text goes through this pipeline's own TF-IDF step.
# It is cast to `str` to mirror the coercion applied to the training documents.
y_pred_test = pipeline.predict(df_test['text'].astype(str))

In [158]:
# Build the submission from the ids that ship with the test file plus the predictions.
# `df_test` already has an `index` column, so it is used as-is rather than being
# overwritten with positional values (which mutates `df_test` and can mislabel rows
# if the ids are not contiguous). `y_pred_test` is matched to the rows by position.
submission_logreg = df_test[['index']].assign(lang_id=y_pred_test)

# Save the submission DataFrame to a CSV file
submission_logreg.to_csv('data/submission_logreg.csv', index=False)

In [159]:
# Display the logistic-regression submission frame (pandas truncates the middle rows).
submission_logreg

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,nbl
...,...,...
5677,5678,eng
5678,5679,nso
5679,5680,sot
5680,5681,sot


In [160]:
# Save the trained model.
# Persist the fitted logistic-regression `pipeline` (TF-IDF + LogisticRegression).
# The previous code dumped `model`, the same object written to the SVC file, so this
# file did not contain the logistic-regression model at all.
joblib.dump(pipeline, 'data/trained_logreg_model.pkl')

['data/trained_logreg_model.pkl']