In [2]:
import re
import unicodedata

import pandas as pd
from deep_translator import GoogleTranslator
from fuzzywuzzy import process
from langdetect import detect, DetectorFactory

In [3]:
# Ensure consistent results from langdetect: fix the detector factory's seed
# so that repeated runs assign the same language code to the same text.
# This cell must run before the first call to detect().
DetectorFactory.seed = 0

In [4]:
# Function to clean queries
def clean_text(text):
    """Normalize a query for matching.

    The value is stringified and lowercased, punctuation/symbols are removed,
    and every run of whitespace is collapsed to a single space with no
    leading or trailing space.

    Args:
        text: The raw query. Any object is accepted and converted with
            ``str()``; ``None`` and float NaN (pandas' missing-value marker)
            are treated as empty.

    Returns:
        str: The cleaned query, or ``""`` for a missing value.
    """
    # Missing values would otherwise be stringified into the bogus queries
    # "none" / "nan". NaN is the only float that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    lowered = str(text).lower()

    # Keep word characters (alphanumerics and "_"), whitespace, and Unicode
    # combining marks (categories Mn/Mc/Me). A plain r"[^\w\s]" regex deletes
    # combining marks because `\w` does not match them, which strips vowel
    # signs from Telugu and other Indic-script text and changes the words.
    kept = "".join(
        ch
        for ch in lowered
        if ch.isalnum()
        or ch == "_"
        or ch.isspace()
        or unicodedata.category(ch).startswith("M")
    )

    # Collapse whitespace only after punctuation is gone; doing it first
    # leaves double or trailing spaces where a removed character stood alone.
    return re.sub(r"\s+", " ", kept).strip()

In [5]:
# Function to detect language
def detect_language(text):
    """Return the language code langdetect assigns to ``text``.

    Args:
        text: The text to classify. Non-string values (for example NaN from
            an empty CSV cell) and blank strings are not passed to
            langdetect.

    Returns:
        str: The code returned by ``detect`` (the notebook filters on
        ``"en"`` and ``"te"``), or ``"unknown"`` when the input is not a
        non-blank string or detection fails.
    """
    # Nothing to classify: answer directly instead of relying on langdetect
    # raising for these inputs.
    if not isinstance(text, str) or not text.strip():
        return "unknown"

    try:
        return detect(text)
    except Exception:
        # Best effort: any detection failure maps to "unknown". Catching
        # Exception rather than using a bare `except:` lets KeyboardInterrupt
        # and SystemExit propagate, so a long-running apply can be stopped.
        return "unknown"

In [6]:
# Crop identifiers; each one is used as the base name of that crop's CSV files.
crops = [
    'green_gram',
    'red_gram',
    'black_gram',
    'groundnut',
    'rice',
    'cotton',
]

In [8]:
# For every crop, read "<crop>.csv", tag each row with the language detected
# for its "KccAns" value, and write the Telugu and English rows to
# "<crop>_telugu.csv" and "<crop>_english.csv".
for crop in crops:
    filename = f"{crop}.csv"
    df = pd.read_csv(filename)

    # New "Language" column holding the detected code for each row.
    df["Language"] = df["KccAns"].apply(detect_language)

    # Row masks for the two languages that are kept. Rows with any other code
    # (including "unknown") end up in neither output file.
    is_telugu = df["Language"] == "te"
    is_english = df["Language"] == "en"
    df_telugu = df.loc[is_telugu]
    df_english = df.loc[is_english]

    # One output file per language, without the index column.
    df_telugu.to_csv(f"{crop}_telugu.csv", index=False)
    df_english.to_csv(f"{crop}_english.csv", index=False)

In [12]:
# Sanity check: reload each per-language CSV from disk and print its row count.
for crop in crops:
    # English file for this crop.
    filename = f"{crop}_english.csv"
    df = pd.read_csv(filename)
    print(f"{crop}_english length: {len(df)}")
    # Telugu file for this crop; note that `filename` and `df` are rebound,
    # so after the loop they refer to the last crop's Telugu file.
    filename = f"{crop}_telugu.csv"
    df = pd.read_csv(filename)
    print(f"{crop}_telugu length: {len(df)}")

green_gram_english length: 1636
green_gram_telugu length: 891
red_gram_english length: 1565
red_gram_telugu length: 1290
black_gram_english length: 3765
black_gram_telugu length: 2505
groundnut_english length: 4965
groundnut_telugu length: 3084
rice_english length: 9978
rice_telugu length: 8263
cotton_english length: 5551
cotton_telugu length: 3468
