In [21]:
!pip install -q pandas numpy scikit-learn openpyxl


In [22]:
# Import libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


In [23]:
# --- Custom Transformers ---

class UnicodeTextCleaner(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.apply(lambda x: re.sub(r'[^\w\s\u0C80-\u0CFF]', '', str(x).lower()))  # Kannada unicode range

class TextStats(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.DataFrame({
            'char_count': X.apply(len),
            'word_count': X.apply(lambda x: len(x.split())),
            'sentence_count': X.apply(lambda x: len(re.findall(r'[.!?]+', x))),
        })

class PunctuationStats(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.DataFrame({
            'exclamation_count': X.apply(lambda x: x.count('!')),
            'question_count': X.apply(lambda x: x.count('?')),
            'comma_count': X.apply(lambda x: x.count(',')),
            'period_count': X.apply(lambda x: x.count('.')),
        })

class WordLengthStats(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return pd.DataFrame({
            'avg_word_length': X.apply(lambda x: np.mean([len(word) for word in x.split()]) if x.split() else 0)
        })

In [25]:
# --- Load your data (you can upload to Colab or use files.upload) ---

# Assuming the file is named data.xlsx
df = pd.read_excel("data 1.xlsx").dropna()
X = df['text']
y = df['dialect']

In [26]:
# --- Train/Test Split ---
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)


In [27]:
# --- Feature Engineering ---

combined_features = FeatureUnion([
    ('tfidf', Pipeline([
        ('cleaner', UnicodeTextCleaner()),
        ('tfidf_vectorizer', TfidfVectorizer(ngram_range=(1, 2), max_features=5000))
    ])),
    ('stats', Pipeline([
        ('cleaner', UnicodeTextCleaner()),
        ('stats_union', FeatureUnion([
            ('text_stats', TextStats()),
            ('punctuation_stats', PunctuationStats()),
            ('word_length_stats', WordLengthStats())
        ]))
    ]))
])


In [28]:
# --- Full Pipeline ---
pipeline = Pipeline([
    ('features', combined_features),
    ('classifier', RandomForestClassifier(class_weight='balanced', random_state=42, n_estimators=150))
])

In [29]:
# --- Train Model ---
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)



In [30]:
# --- Evaluation ---
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

 Arre bhashe       0.88      0.81      0.84       181
     kannada       0.93      0.96      0.94       468

    accuracy                           0.92       649
   macro avg       0.90      0.88      0.89       649
weighted avg       0.91      0.92      0.91       649



In [18]:
# --- Optional: Predict on new samples ---
test_samples = [
    ("Arre bhashe", "ಅಭುಟ್ಟ ಎಂದೆಕೆ ಬರ್ಪಿನಿ?"),
    ("kannada", "ಈ ಬಾರಿ ಚುನಾವಣೆಯಲ್ಲಿ ಜನತೆ ಉತ್ತಮ ನಿರ್ಧಾರ ತೆಗೆದುಕೊಳ್ಳಬೇಕಾಗಿದೆ."),
]
texts = [text for label, text in test_samples]
true_labels = [label for label, text in test_samples]

texts_series = pd.Series(texts)
predicted_labels = pipeline.predict(texts_series)


In [19]:
# Show predictions
results_df = pd.DataFrame({
    "Text": texts,
    "True Label": true_labels,
    "Predicted Label": predicted_labels
})

In [20]:
from tabulate import tabulate
print(tabulate(results_df, headers='keys', tablefmt='grid', showindex=False))


+----------------------------------------+--------------+-------------------+
| Text                                   | True Label   | Predicted Label   |
| ಅಭುಟ್ಟ ಎಂದೆಕೆ ಬರ್ಪಿನಿ?                         | Arre bhashe  | Arre bhashe       |
+----------------------------------------+--------------+-------------------+
| ಈ ಬಾರಿ ಚುನಾವಣೆಯಲ್ಲಿ ಜನತೆ ಉತ್ತಮ ನಿರ್ಧಾರ ತೆಗೆದುಕೊಳ್ಳಬೇಕಾಗಿದೆ. | kannada      | kannada           |
+----------------------------------------+--------------+-------------------+
