In [6]:
import numpy as np
import pandas as pd
import wittgenstein as lw
import os
import re
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [8]:
def split_document(text, max_length=250, overlap=50):
    # Split text into words
    words = text.split()
    parts = []
    if len(words) <= max_length:
        return [text]  # Return the entire text if it's short enough

    i = 0
    while i < len(words):
        # Ensure that we don't exceed the text length
        end_index = min(i + max_length, len(words))
        # Join the selected range of words back into a string
        chunk_text = " ".join(words[i:end_index])
        parts.append(chunk_text)
        i += (max_length - overlap)
    
    return parts

def load_data(directory_paths):
    texts, labels = [], []
    for dialect, directory in directory_paths.items():
        for filename in os.listdir(directory):
            filepath = os.path.join(directory, filename)
            with open(filepath, 'r', encoding='latin-1') as file:
                text = file.read().strip()
                text = re.sub(r'\d+', '', text)  # Remove numbers
                chunks = split_document(text)
                for chunk in chunks:
                    texts.append(chunk)
                    labels.append(dialect)  # Each chunk gets the same label as the original document
    return texts, labels




# Define your directory paths correctly
directory_paths = {
    # 'other': '../language_resources/dialects_mixed_txt/other_quechua/',
    'qub': '../language_resources/dialects_mixed_txt/qub',
    'quf': '../language_resources/dialects_mixed_txt/quf',
    'quh': '../language_resources/dialects_mixed_txt/quh',
    'quk': '../language_resources/dialects_mixed_txt/quk',
    'qul': '../language_resources/dialects_mixed_txt/qul',
    'qup': '../language_resources/dialects_mixed_txt/qup',
    'quw': '../language_resources/dialects_mixed_txt/quw',
    'qux': '../language_resources/dialects_mixed_txt/qux',
    'quy': '../language_resources/dialects_mixed_txt/quy',
    'quz': '../language_resources/dialects_mixed_txt/quz', 
    'qvc': '../language_resources/dialects_mixed_txt/qvc',
    'qve': '../language_resources/dialects_mixed_txt/qve',  
    'qvi': '../language_resources/dialects_mixed_txt/qvi',
    'qvm': '../language_resources/dialects_mixed_txt/qvm',
    'qvn': '../language_resources/dialects_mixed_txt/qvn',
    'qvo': '../language_resources/dialects_mixed_txt/qvo',
    'qvw': '../language_resources/dialects_mixed_txt/qvw',
    'qvz': '../language_resources/dialects_mixed_txt/qvz',
    'qwh': '../language_resources/dialects_mixed_txt/qwh',
    'qxl': '../language_resources/dialects_mixed_txt/qxl',
    'qxh': '../language_resources/dialects_mixed_txt/qxh',
    'qxn': '../language_resources/dialects_mixed_txt/qxn',
    'qxo': '../language_resources/dialects_mixed_txt/qxo',
    'qxr': '../language_resources/dialects_mixed_txt/qxr'
    # 'southern': '../language_resources/dialects_mixed_txt/southern_quechua'
}


In [9]:
# Load data
texts, labels = load_data(directory_paths)
data = pd.DataFrame({'text': texts, 'label': labels})

# Feature extraction (basic example, you might want to customize this)
data['text'] = data['text'].apply(lambda x: x.lower())  #  simple text pre-processing

# Create train and test sets
X_train, X_test, y_train, y_test = train_test_split(data['text'], data['label'], test_size=0.2, random_state=42)

# Initialize and train the JRip-like classifier
ripper_clf = lw.RIPPER()
ripper_clf.fit(X_train, y_train, class_feat='label', pos_class='specific_dialect')

# Predict and evaluate
y_pred = ripper_clf.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred))

No positive samples. Existing target labels=['qxr', 'qvi', 'quy', 'quk', 'qvw', 'qvo', 'qxh', 'qvm', 'quw', 'qul', 'quh', 'qxl', 'qvn', 'qup', 'quz', 'qve', 'qxo', 'qxn', 'qwh', 'quf', 'qux', 'qvc', 'qvz', 'qub'].

Ruleset is empty. All predictions it makes with method .predict will be negative. It may be untrained or was trained on a dataset split lacking positive examples.

Ruleset is empty. All predictions it makes with method .predict will be negative. It may be untrained or was trained on a dataset split lacking positive examples.



ValueError: Mix of label input types (string and number)

In [10]:
# Load and preprocess data
texts, labels = load_data(directory_paths)  # Provide your actual directory paths
features, vectorizer = preprocess_texts(texts)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Train a decision tree model
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# Extract rules from the decision tree
tree_rules = export_text(model, feature_names=vectorizer.get_feature_names())

# Evaluate the model
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

# Print the rules
print("Decision Tree Rules:\n", tree_rules)



AttributeError: 'TfidfVectorizer' object has no attribute 'get_feature_names'