In [None]:
!pip install pandas scikit-learn joblib emoji clean-text[gpl] unidecode

import pandas as pd
import re, emoji, unidecode
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import joblib
from cleantext import clean
from google.colab import files

# --- Load the teacher dataset uploaded through the Colab file picker ---
print("Upload your teacher_dataset_100.csv file")
uploaded = files.upload()
if not uploaded:
    # Without this guard an empty upload result surfaces as a bare IndexError.
    raise ValueError("No file was uploaded; cannot build the training set.")
dataset_path = next(iter(uploaded))

df = pd.read_csv(dataset_path)
print(f"Dataset loaded: {df.shape[0]} teachers")
print(df.head())

# Fail early with a readable message if the CSV lacks the columns used below.
missing_columns = sorted({"Teacher_Name", "Teacher_ID"} - set(df.columns))
if missing_columns:
    raise KeyError(f"Dataset is missing required column(s): {missing_columns}")

teacher_names = df['Teacher_Name'].tolist()
teacher_ids = df['Teacher_ID'].tolist()

# Query templates per intent. "{name}" / "{tid}" are filled in per teacher.
INTENT_TEMPLATES = {
    "find_location": [
        "Where is {name}?",
        "{tid} kaha milenge?",
        "{name} abhi kahan hain?",
        "{tid} ki location kya hai?",
        "Tell me the location of {name}",
    ],
    "check_availability": [
        "Is {name} free right now?",
        "{tid} ka free time kab hai?",
        "When is {name} available?",
        "{tid} abhi padhate hain kya?",
        "{name} ki next free slot kya hai?",
    ],
    "book_appointment": [
        "Book appointment with {name} at 2 PM",
        "Mujhe {tid} ke sath 3 baje milna hai",
        "Can I meet {name} tomorrow?",
        "{tid} se baat karni hai",
        "Schedule meeting with {name}",
    ],
}

intents = []
queries = []

# Every teacher is paired with every intent. Assigning disjoint slices of
# teachers to each intent would let the classifier key on the teacher's
# name/ID tokens instead of the wording of the request, and would leave
# some teachers out of the training data entirely.
for name, tid in zip(teacher_names, teacher_ids):
    for intent, templates in INTENT_TEMPLATES.items():
        queries += [template.format(name=name, tid=tid) for template in templates]
        intents += [intent] * len(templates)

# Teacher-independent intents.
greetings = ["Hi", "Hello", "Namaste", "Hey bot", "Good morning", "Kaise ho", "Are you there?", "Hi assistant"]
queries += greetings
intents += ["greeting"] * len(greetings)

fallbacks = ["What is the weather?", "Tell me a joke", "Open camera", "Play music", "Who is Elon Musk?"]
queries += fallbacks
intents += ["fallback"] * len(fallbacks)

train_df = pd.DataFrame({"query": queries, "intent": intents})
print(f"Training Samples: {train_df.shape[0]}")
print(train_df['intent'].value_counts())

def preprocess(text):
    """Normalise a raw user query into the form the intent model is trained on.

    The input is coerced to ``str``, lower-cased, passed through
    ``unidecode``, ``emoji.replace_emoji`` (emoji replaced by the empty
    string) and ``cleantext.clean``, and finally has every run of whitespace
    collapsed to a single space with leading/trailing whitespace removed.

    Digits, numbers and punctuation are explicitly kept (``no_numbers``,
    ``no_digits`` and ``no_punct`` are all ``False``); the URL, e-mail,
    phone-number and currency-symbol options of ``clean`` are switched on.
    """
    lowered = str(text).lower()
    # NOTE(review): unidecode runs before emoji removal, so emoji may already
    # be gone or transliterated by the time replace_emoji sees the text;
    # confirm the intended order.
    transliterated = unidecode.unidecode(lowered)
    without_emoji = emoji.replace_emoji(transliterated, replace="")
    cleaned = clean(
        without_emoji,
        fix_unicode=True,
        to_ascii=True,
        lower=True,
        no_urls=True,
        no_emails=True,
        no_phone_numbers=True,
        no_numbers=False,
        no_digits=False,
        no_currency_symbols=True,
        no_punct=False,
    )
    return re.sub(r'\s+', ' ', cleaned).strip()

# Normalise every query with the same function that must be applied to user
# input at prediction time.
train_df['clean_query'] = train_df['query'].apply(preprocess)

# Stratified 80/20 split so each intent appears in both train and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    train_df['clean_query'],
    train_df['intent'],
    test_size=0.2,
    random_state=42,
    stratify=train_df['intent']
)

# Word uni-/bi-gram TF-IDF features feeding a class-weighted logistic
# regression (the greeting/fallback classes are much smaller than the rest).
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(ngram_range=(1, 2), max_features=8000)),
    ("clf", LogisticRegression(max_iter=300, class_weight='balanced'))
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print("Model Evaluation Report:")
# zero_division=0 reports 0.0 for a class with no predicted samples without
# emitting an UndefinedMetricWarning for each affected metric.
print(classification_report(y_test, y_pred, zero_division=0))

MODEL_FILENAME = "teacher_intent_model.pkl"
DATASET_FILENAME = "teacher_dataset_100.csv"

# The saved pipeline was fitted on text already passed through preprocess();
# it does not include that cleaning step itself.
joblib.dump(pipeline, MODEL_FILENAME)

# Re-save the uploaded dataset under a fixed name so it can be downloaded
# alongside the model.
df.to_csv(DATASET_FILENAME, index=False)

files.download(MODEL_FILENAME)
files.download(DATASET_FILENAME)

print("Model + dataset saved successfully!")


Collecting emoji
  Using cached emoji-2.15.0-py3-none-any.whl.metadata (5.7 kB)
Collecting unidecode
  Using cached Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Collecting clean-text[gpl]
  Using cached clean_text-0.6.0-py3-none-any.whl.metadata (6.6 kB)
Collecting emoji
  Using cached emoji-1.7.0.tar.gz (175 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy<7.0,>=6.0 (from clean-text[gpl])
  Using cached ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Using cached Unidecode-1.4.0-py3-none-any.whl (235 kB)
Using cached ftfy-6.3.1-py3-none-any.whl (44 kB)
Using cached clean_text-0.6.0-py3-none-any.whl (11 kB)
Building wheels for collected packages: emoji
  Building wheel for emoji (setup.py) ... [?25l[?25hdone
  Created wheel for emoji: filename=emoji-1.7.0-py3-none-any.whl size=171031 sha256=8d60dcb65c814dc152b391f3b3ac35f970964a856b9dab68cef062c8a48d48be
  Stored in directory: /root/.cache/pip/wheels/e0/8c/e0/294d2e4ea0e55792bfc99b6b263e4a0511443da7b69af676

Saving teacher_dataset_100 (1).csv to teacher_dataset_100 (1).csv
Dataset loaded: 100 teachers
  Teacher_ID  Teacher_Name      Subject    Block  Room_Number  Cabin_Number  \
0       T101      Arun Das          Law  Block C          102           448   
1       T102   Qadir Mehta   Statistics  Block B          400           192   
2       T103   Juhi Sharma    Economics  Block B          437           367   
3       T104  Pooja Sharma  Linguistics  Block A          216           417   
4       T105    Pooja Bose    Economics  Block D          185           483   

  Lecture_Start Lecture_End Free_Start Free_End Available_Days  
0         11:30       12:30      12:00    13:00        Mon-Fri  
1         11:00       12:00      12:30    13:30        Mon-Fri  
2         16:30       17:30      10:00    11:00        Mon-Fri  
3         16:00       17:00      10:00    11:00        Mon-Fri  
4         08:00       09:00      09:30    10:30        Mon-Sat  
Training Samples: 463
intent
find_locati

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Model + dataset saved successfully!
