# In [None]:
import os
import pandas as pd
import numpy as np
from http import HTTPStatus
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib
import dashscope
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources used below: tokenizer models for word_tokenize,
# the English stop-word list, and the WordNet data for the lemmatizer.
nltk.download('punkt')
# Newer NLTK releases load the word tokenizer from 'punkt_tab' rather than
# 'punkt'. nltk.download reports (rather than raises) when a resource id is
# unknown to an older release, so requesting both is safe.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize the NLP helpers shared by preprocess_text
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# NOTE(review): the original comment at this point mentioned an API key, but
# no key is configured anywhere in this file. Presumably dashscope picks up
# its credentials from its own configuration (e.g. an environment variable)
# -- confirm before running. The dataset itself is loaded once, further down,
# immediately before preprocessing, so it is not read here.

def preprocess_text(text):
    """Normalize a raw document into a space-joined string of processed tokens.

    The text is lowercased, stripped of every character that is not an ASCII
    letter or whitespace, tokenized, filtered against ``stop_words``, and each
    surviving token is stemmed and then lemmatized.

    Args:
        text: The raw document. Non-string values (for example ``None`` or the
            float NaN that pandas uses for missing cells) are treated as empty
            documents.

    Returns:
        The processed tokens joined by single spaces; ``''`` when the input is
        not a string or no token survives filtering.
    """
    # A missing cell would otherwise raise AttributeError on .lower() and
    # abort the whole DataFrame.apply pass.
    if not isinstance(text, str):
        return ''

    # Lowercase, then remove non-alphabetic characters. Punctuation is removed
    # before stop-word filtering, so any stop-word entry that contains an
    # apostrophe can no longer match its (now apostrophe-free) token.
    cleaned = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    tokens = word_tokenize(cleaned)

    # Stop words are tested on the raw token; survivors are stemmed first and
    # the stem is then passed through the lemmatizer.
    return ' '.join(
        lemmatizer.lemmatize(stemmer.stem(word))
        for word in tokens
        if word not in stop_words
    )

# Function to call Tongyi Qianwen API
def call_tongyi_qianwen(text, model="qwen_1_8b_chat"):
    """Send ``text`` as a prompt to the Tongyi Qianwen API and return the reply.

    Args:
        text: Prompt to send. Blank or non-string prompts are not sent at all.
        model: Model identifier passed to ``dashscope.Generation.call``.
            NOTE(review): the default is kept from the original code, which
            itself flagged the name as unverified -- confirm it against the
            provider's model list.

    Returns:
        The ``content`` of the first returned choice's message, or ``None``
        when the prompt is blank, the request fails, or the response carries
        no choices.
    """
    # Preprocessing can reduce a document to an empty string; there is nothing
    # meaningful to send in that case, so skip the request entirely.
    if not isinstance(text, str) or not text.strip():
        return None

    response = dashscope.Generation.call(
        model=model,
        prompt=text,
        result_format='message',
    )

    if response.status_code != HTTPStatus.OK:
        print(f'Request id: {response.request_id}, Status code: {response.status_code}, error code: {response.code}, error message: {response.message}')
        return None

    # Walk the response defensively: a missing output, an empty choice list,
    # or a choice without a message all yield None instead of raising.
    output = response.output
    choices = output.get("choices") if output else None
    if not choices:
        return None
    message = choices[0].get("message") or {}
    return message.get("content")

# Load the dataset

df = pd.read_csv('daigt_v2_train_preproc_NLP_aug_v1 2.csv')
df['processed_text'] = df['text'].apply(preprocess_text)

# Use Tongyi Qianwen API to enhance text features (one request per row)
df['tongyi_feature'] = df['processed_text'].apply(call_tongyi_qianwen)

# Simplified feature engineering: length of the response as a feature
# (0 when the API returned nothing for a row)
df['tongyi_feature_length'] = df['tongyi_feature'].apply(lambda x: len(x) if x else 0)
y = df['label']

# Split the rows BEFORE fitting TF-IDF so the vocabulary and IDF weights are
# learned from the training rows only; the held-out rows must stay unseen
# until evaluation. Same test_size/random_state as before.
train_df, test_df, y_train, y_test = train_test_split(
    df, y, test_size=0.2, random_state=42
)

# Feature extraction with TF-IDF: fit on train, only transform test
tfidf_vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))
train_tfidf = tfidf_vectorizer.fit_transform(train_df['processed_text'])
test_tfidf = tfidf_vectorizer.transform(test_df['processed_text'])


def _with_length_column(tfidf_matrix, frame):
    """Return the dense TF-IDF matrix with the response-length column appended."""
    lengths = frame['tongyi_feature_length'].to_numpy().reshape(-1, 1)
    return np.hstack((tfidf_matrix.toarray(), lengths))


X_train = _with_length_column(train_tfidf, train_df)
X_test = _with_length_column(test_tfidf, test_df)

# Define a pipeline with RandomForestClassifier and additional steps if needed
pipeline = Pipeline([
    ('clf', RandomForestClassifier(random_state=42)),
])

# Hyperparameter tuning with GridSearchCV
param_grid = {
    'clf__n_estimators': [100, 200, 300],
    'clf__max_depth': [10, 20, 30],
}
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy')

# Nested cross-validation: every outer fold runs the complete grid search on
# its own training part, so these scores estimate the tuned procedure as a
# whole. NOTE: the folds still share the single TF-IDF fit made over all
# training rows above.
cv_scores = cross_val_score(grid_search, X_train, y_train, cv=5)
print(f'Cross-validation scores: {cv_scores}')

# Final search on the full training set. fit() returns the GridSearchCV
# wrapper itself, so take the refitted winner explicitly.
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

# Make predictions on the test set and evaluate
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))
print(f'Accuracy: {accuracy_score(y_test, y_pred)}')

# Save the tuned model (not the whole search object) and the TF-IDF vectorizer
joblib.dump(best_model, 'text_classification_model.pkl')
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

