<a href="https://colab.research.google.com/github/divyaprabhakaran7/Code-Mixed-Spanish-VLM-Study/blob/main/Create_Spanglish_Tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# ✅ Fixing numpy + transformers + other dependencies
# One-off environment setup cell: reinstall the Python dependencies, fetch the
# NLTK / spaCy data, then kill the kernel process so the new packages are the
# ones that get imported afterwards.
#
# numpy and transformers are pinned to exact versions; --force-reinstall
# replaces whatever build is already present in the runtime.
# NOTE(review): the recorded output of this cell shows pip reporting dependency
# conflicts for numpy 1.23.5 (jax, scikit-image, pymc, ... require newer numpy).
!pip install -q --upgrade --force-reinstall numpy==1.23.5
!pip install -q --upgrade --force-reinstall transformers==4.36.2
# The remaining packages are force-reinstalled without a version pin.
!pip install -q --upgrade --force-reinstall pandas nltk spacy sentencepiece tqdm
# NLTK tokenizer/tagger resources and the small Spanish spaCy model.
!python -m nltk.downloader punkt averaged_perceptron_tagger averaged_perceptron_tagger_eng
!python -m spacy download es_core_news_sm

import os
# Sends signal 9 (SIGKILL on Linux) to the current process: the kernel dies
# immediately, so any in-memory state is lost and later cells must be run
# after the runtime has come back up.
os.kill(os.getpid(), 9)  # 🔁 Restart runtime to take effect


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
blosc2 3.2.1 requires numpy>=1.26, but you have numpy 1.23.5 which is incompatible.
jaxlib 0.5.1 requires numpy>=1.25, but you have numpy 1.23.5 which is incompatible.
chex 0.1.89 requires numpy>=1.24.1, but you have numpy 1.23.5 which is incompatible.
pymc 5.21.2 requires numpy>=1.25.0, but you have numpy 1.23.5 which is incompatible.
bigframes 1.42.0 requires numpy>=1.24.0, but you have numpy 1.23.5 which is incompatible.
jax 0.5.2 requires numpy>=1.25, but you have numpy 1.23.5 which is incompatible.
scikit-image 0.25.2 requires numpy>=1.24, but you have numpy 1.23.5 which is incompatible.
treescope 0.1.9 requires numpy>=1.25.2, but you have numpy 1.23.5 which is incompatible.
thinc 

In [2]:
# ✅ Step 1: Install lightweight dependencies first
# Only pandas and tqdm are needed to upload and preview the CSV; the heavier
# NLP packages are installed further down in this cell.
!pip install -q pandas tqdm

import pandas as pd
from tqdm import tqdm
from google.colab import files

# ✅ Step 2: Upload CSV with English tweets FIRST
print("📁 Upload a CSV with an English tweet column:")
# Blocks until the user finishes the Colab upload dialog.
# NOTE(review): presumably returns a mapping keyed by the saved file name —
# confirm against the google.colab.files documentation.
uploaded = files.upload()
# Only the first uploaded entry is read; any additional files are ignored.
# next(iter(...)) raises StopIteration if nothing was uploaded.
df = pd.read_csv(next(iter(uploaded)))

# ✅ Step 3: Preview + detect English column
# Normalise headers so the keyword search below is case/whitespace-insensitive.
df.columns = df.columns.str.strip().str.lower()

# Keywords are tried in priority order: every column is checked for 'caption'
# first, then 'tweet', and only then for the loose substring 'en'. A single
# pass with `or` would let an unrelated earlier column such as "content" or
# "sentiment" (both contain "en") win over a real "tweet"/"caption" column.
_en_col_keywords = ('caption', 'tweet', 'en')
_en_col_candidates = [
    c for kw in _en_col_keywords for c in df.columns if kw in c
]
if not _en_col_candidates:
    # Fail with an actionable message instead of a bare IndexError.
    raise ValueError(
        "No English text column found: expected a column name containing "
        f"one of {_en_col_keywords}, but the CSV has columns {list(df.columns)}"
    )
en_col = _en_col_candidates[0]

print(f"✅ Using column: '{en_col}'")
print(df[[en_col]].head(3))

# (Optional) Limit for fast testing
# df = df.head(50)

# ✅ Step 4: Install and load NLP + translation tools
# These installs are unpinned, so they resolve to whatever versions pip picks
# at run time.
!pip install -q nltk spacy transformers sentencepiece
!python -m nltk.downloader punkt averaged_perceptron_tagger averaged_perceptron_tagger_eng
!python -m spacy download es_core_news_sm

# ✅ Step 5: Imports
import nltk
from nltk import word_tokenize, pos_tag, RegexpParser
from transformers import pipeline
import spacy
from functools import lru_cache

# ✅ Step 6: Load tools
# NLTK resources fetched from Python as well (the shell command above already
# requested punkt and the perceptron taggers; punkt_tab is only requested here).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt_tab')
# Spanish spaCy pipeline. NOTE(review): nlp_es is not referenced by any code
# visible in this notebook section — confirm whether it is still needed.
nlp_es = spacy.load("es_core_news_sm")
# English -> Spanish translation pipeline used by translate_phrase() below.
fast_translator = pipeline("translation", model="Helsinki-NLP/opus-mt-en-es")

@lru_cache(maxsize=None)
def translate_phrase(phrase):
    """Translate an English phrase to Spanish via the shared ``fast_translator``.

    Results are memoised per phrase by ``lru_cache``, so a repeated phrase
    is only sent to the model once. The ``"[failed]"`` sentinel is cached as
    well.

    Args:
        phrase: English text to translate. Must be hashable (a ``str``)
            because it is used as the cache key.

    Returns:
        The translation with surrounding whitespace stripped, or the sentinel
        string ``"[failed]"`` when the translator raises an error or yields
        an empty translation.
    """
    try:
        result = fast_translator(phrase, max_length=50)[0]['translation_text'].strip()
    except Exception:
        # Only ordinary errors are converted into the sentinel. A bare
        # ``except:`` would also swallow KeyboardInterrupt / SystemExit, which
        # makes a long translation run impossible to interrupt and caches the
        # interrupted phrase as a failure.
        return "[failed]"
    return result if result else "[failed]"

def extract_phrases(text):
    words = word_tokenize(text)
    tagged = pos_tag(words, lang='eng')
    grammar = "PHRASE: {<DT>?<JJ>*<NN.*>+|<VB.*><RB.?>*<IN>?<DT>?<NN.*>+}"
    parser = RegexpParser(grammar)
    tree = parser.parse(tagged)
    phrases = [' '.join(w for w, t in leaf) for leaf in tree.subtrees() if leaf.label() == 'PHRASE']
    return sorted(phrases, key=lambda x: len(x.split()), reverse=True)

# ✅ Step 7: Main Spanglish logic
def create_spanglish_rows(df, en_col='english'):
    """Build a code-mixed ("Spanglish") version of every English text in ``df``.

    For each row, one phrase is picked from ``extract_phrases`` (the second
    entry when there are at least two, otherwise the only one), translated
    with ``translate_phrase``, and substituted into the original text.

    Args:
        df: DataFrame holding the English texts.
        en_col: Name of the column in ``df`` that contains the English text.
            Values are converted with ``str()`` before processing.

    Returns:
        A new DataFrame with one row per input row and the columns
        ``"Original English"``, ``"Phrase (EN)"``, ``"Translated (ES)"`` and
        ``"Spanglish Tweet"`` (these columns are present even for empty input).

        * If no phrase could be extracted, the phrase and translation are
          empty strings and the tweet is returned unchanged; the translator
          is not called.
        * If the translation is the ``"[failed]"`` sentinel, the tweet is
          returned unchanged so the sentinel never leaks into the text.
        * If the phrase occurs verbatim in the text it is replaced by the
          translation; otherwise the translation is appended in parentheses.
    """
    output_columns = [
        "Original English",
        "Phrase (EN)",
        "Translated (ES)",
        "Spanglish Tweet",
    ]

    # Stringify once so both passes see exactly the same text.
    texts = [str(value) for value in df[en_col]]

    second_phrases = []
    for text in texts:
        phrases = extract_phrases(text)
        if len(phrases) >= 2:
            second_phrases.append(phrases[1])
        elif phrases:
            second_phrases.append(phrases[0])
        else:
            second_phrases.append("")

    # Never send an empty phrase to the model: there is nothing to translate,
    # and whatever it returned used to be appended to the tweet.
    translated_phrases = [
        translate_phrase(p) if p else "" for p in tqdm(second_phrases)
    ]

    results = []
    rows = zip(texts, second_phrases, translated_phrases)
    for en_text, eng_phrase, span_phrase in tqdm(rows, total=len(texts)):
        if not eng_phrase or span_phrase == "[failed]":
            # Nothing usable to mix in: keep the original text as-is.
            spanglish = en_text
        elif eng_phrase in en_text:
            # NOTE: str.replace swaps every occurrence of the phrase,
            # including occurrences inside longer words.
            spanglish = en_text.replace(eng_phrase, span_phrase)
        else:
            # The phrase is rebuilt from tokens and may not match the raw
            # text verbatim; fall back to appending the translation.
            spanglish = en_text + f" ({span_phrase})"

        results.append({
            "Original English": en_text,
            "Phrase (EN)": eng_phrase,
            "Translated (ES)": span_phrase,
            "Spanglish Tweet": spanglish
        })

    return pd.DataFrame(results, columns=output_columns)

# ✅ Step 8: Run + save
# Generate the Spanglish rows for the detected English column, write them to
# a CSV in the working directory, and hand that file to the browser.
output_csv = "spanglish_tweets_output.csv"

print("⚙️ Processing tweets...")
spanglish_df = create_spanglish_rows(df, en_col=en_col)
spanglish_df.to_csv(output_csv, index=False)

print("✅ Done! Download your result:")
files.download(output_csv)


📁 Upload a CSV with an English tweet column:


Saving filtered_tweets_en_es (3).csv to filtered_tweets_en_es (3) (1).csv
✅ Using column: 'caption'
                                             caption
0       How I feel today #legday #jelly #aching #gym
1  @ArrivaTW absolute disgrace two carriages from...
2  This is my Valentine's from 1 of my nephews. I...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
Collecting es-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl (12.9 MB)
[2K     [90m

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


⚙️ Processing tweets...


100%|██████████| 4222/4222 [30:28<00:00,  2.31it/s]
100%|██████████| 4222/4222 [00:00<00:00, 22456.34it/s]


✅ Done! Download your result:


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>