In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
import numpy as np


def load_data(file_paths):
    data_frames = []
    for file_path in file_paths:
        df = pd.read_excel(file_path)
        for i in range(1, 6):
            if f'Code{i}' not in df.columns:
                df[f'Code{i}'] = pd.NA
        df['Codes'] = df.apply(lambda row: [row[f'Code{i}'] for i in range(1, 6) if pd.notna(row[f'Code{i}'])], axis=1)
        data_frames.append(df[['Transcript', 'Codes']])
    return pd.concat(data_frames, ignore_index=True)

# Load and preprocess all coded data
coded_file_paths = [
    '/content/302233_PRECAPI_CAMPINT.POLATTREV_ASK_20161009.xlsx',
    '/content/302261_PRECAPI_CAMPINT.POLATTREV_ASK_20160917.xlsx',
    '/content/302331_PRECAPI_CAMPINT.POLATTREV_ASK_20160924 (1).xlsx',
    '/content/302665_PRECAPI_CAMPINT.POLATTREV_ASK_20160915.xlsx',
    '/content/302746_PRECAPI_CAMPINT.POLATTREV_ASK_20160913.xlsx',
    '/content/302877_PRECAPI_CAMPINT.POLATTREV_ASK_20160913.xlsx',
    '/content/Untitled spreadsheet.xlsx'
]
all_coded_data = load_data(coded_file_paths)

# Initialize the MultiLabelBinarizer
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(all_coded_data['Codes'])

# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(all_coded_data['Transcript'], y, test_size=0.25, random_state=42)

# Creating a text classification pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', MultiOutputClassifier(RandomForestClassifier()))
])

# Training the model
pipeline.fit(X_train.tolist(), y_train)

# Evaluating the model
y_pred = pipeline.predict(X_test.tolist())
print(classification_report(y_test, y_pred, target_names=mlb.classes_))

# Function to automatically code new transcripts
def auto_code_new_transcripts(new_transcript_file_path):
    new_transcripts_df = pd.read_excel(new_transcript_file_path)
    predictions = pipeline.predict(new_transcripts_df['Transcript'].tolist())
    # Ensuring predictions are correctly formatted for inverse_transform
    predictions = np.array(predictions, dtype='int')  # Convert to integer numpy array
    new_transcripts_df['Predicted Codes'] = [mlb.inverse_transform(np.atleast_2d(pred)) for pred in predictions]
    return new_transcripts_df

# Path to the new transcripts Excel file
new_transcript_file_path = '/content/302806_PRECAPI_CAMPINT.POLATTREV_ASK_20161010.xlsx'
coded_transcripts = auto_code_new_transcripts(new_transcript_file_path)

# save the coded transcripts to a new Excel file
coded_transcripts.to_excel('/content/302806_PRECAPI_CAMPINT.POLATTREV_ASK_20161010.xlsx', index=False)

# Function to automatically code new transcripts
def auto_code_new_transcripts(new_transcript_file_path, model, mlb):
    new_transcripts_df = pd.read_excel(new_transcript_file_path)
    if 'Transcript' not in new_transcripts_df.columns:
        raise ValueError("New transcripts file must contain a 'Transcript' column.")

    predictions = model.predict(new_transcripts_df['Transcript'].tolist())
    predictions = np.array(predictions, dtype='int')  # Ensure correct format
    new_transcripts_df['Predicted Codes'] = [mlb.inverse_transform(np.atleast_2d(pred)) for pred in predictions]
    return new_transcripts_df

# Assuming the model and mlb (MultiLabelBinarizer) are already defined and trained
# Path to the new transcripts Excel file
new_transcript_file_path = '/content/302806_PRECAPI_CAMPINT.POLATTREV_ASK_20161010.xlsx'

# Load, predict, and save the auto-coded new transcripts
new_coded_transcripts = auto_code_new_transcripts(new_transcript_file_path, pipeline, mlb)

# Path where you want to save the coded transcripts
output_file_path = '/content/302806_PRECAPI_CAMPINT.POLATTREV_ASK_20161010.xlsx'
new_coded_transcripts.to_excel(output_file_path, index=False)

print(f"New transcripts have been auto-coded and saved to {output_file_path}.")


              precision    recall  f1-score   support

         b31       0.50      1.00      0.67         1
         b51       0.00      0.00      0.00         1
         c21       0.00      0.00      0.00         0
          h1       0.00      0.00      0.00         0
          p1       1.00      1.00      1.00         1

   micro avg       0.67      0.67      0.67         3
   macro avg       0.30      0.40      0.33         3
weighted avg       0.50      0.67      0.56         3
 samples avg       0.33      0.33      0.33         3

New transcripts have been auto-coded and saved to /content/302806_PRECAPI_CAMPINT.POLATTREV_ASK_20161010.xlsx.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


New transcripts have been auto-coded and saved to /content/302806_PRECAPI_CAMPINT.POLATTREV_ASK_20161010.xlsx.
