## BREAK

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk
import re

# def normalize_variants(df):
#     """
#     Standardize only the 3D-printing and additive-manufacturing variants
#     in df['Normalized Text'], so downstream vectorization sees the
#     unified phrases.
#     """
#     fixes = {

#     # research/researchers → research
#     r'\bresearchers?\b':'research',
#     # model/models → models
#     r'\bmodels?\b':'models',
#     # part/parts → parts
#     r'\bparts?\b':'parts',
        
#     # gun / guns → "guns"
#     r'\bgun(?:s)?\b':'guns',
    
#     # 3d printing variants → "3d printing"
#     r'\b3d[-\s]*print(?:ing|ed|ted)?\b':'3d printing',
#     r'\b3dprinting\b':'3d printing',

#     # additive manufacturing variants → "additive manufacturing"
#     # match “additive manufacture” + optional “d” or “ing”
#     r'\badditive[-\s]*manufactur(?:e|ed|ing)?\b':'additive manufacturing',
#     # catch concatenated form “additivemanufacturing”
#     r'\badditivemanufactur(?:e|ing)\b':'additive manufacturing',
#     # any other small misspelling
#     r'\baddive[-\s]*manufactu(?:ed|ring)?\b':'additive manufacturing',

#     # materials/material → material
#     r'\bmaterials?\b':'material',

#     r'\b3d[-\s]*printing technology\b':'3d printing technology',
#     r'\bprinting technology\b':'3d printing technology',
        
#     # **ADD THESE LINES**  
#     r'\b3d[-\s]*printer(?:s)?\b':'3d printer',
#     r'\b3dprinter(?:s)?\b':'3d printer',
#     r'\bprinters?\b':'printer',
#     }

#      fixes.update({
#       # use/used
#       r'\bused\b':                     'use',
#       # printing guns → 3d printing guns
#       r'\bprinting guns\b':            '3d printing guns',
#       # first 3d / first 3d printing
#       r'\bfirst\s+3d\b':               '3d',
#       r'\bfirst\s+3d\s+printing\b':    '3d printing',
#       # 3d print-industry
#       r'\b3dprintindustry\b':          '3d printing industry',
#       r'\b3d[-\s]*print(?:ing)?\s+industry\b': '3d printing industry',
#     })

#     text = df['Normalized Text'].fillna('')
#     for pat, repl in fixes.items():
#         text = text.str.replace(pat, repl, regex=True, flags=re.IGNORECASE)
#     df = df.copy()
#     df['Normalized Text'] = text
#     return df


import re

def normalize_variants(df):
    """
    Return a copy of ``df`` whose 'Normalized Text' column has word
    variants collapsed onto a single canonical spelling, so that n-gram
    counting downstream sees one phrase per concept.

    Missing values in the column are treated as empty strings. The rules
    below are applied one after another, case-insensitively, in the order
    listed -- a later rule therefore operates on the output of the earlier
    ones. The frame passed in is left untouched.
    """
    rules = (
        # research / researchers -> research
        (r'\bresearch(?:ers)?\b', 'research'),

        # model / models -> models
        (r'\bmodels?\b', 'models'),
        # part / parts -> parts
        (r'\bparts?\b', 'parts'),

        # gun / guns -> guns
        (r'\bgun(?:s)?\b', 'guns'),

        # 3d printing variants -> "3d printing"
        (r'\b3d[-\s]*print(?:ing|ed|ted)?\b', '3d printing'),
        (r'\b3dprinting\b', '3d printing'),

        # 3d printer variants -> "3d printer"
        (r'\b3d[-\s]*printer(?:s)?\b', '3d printer'),
        (r'\b3dprinter(?:s)?\b', '3d printer'),
        (r'\bprinters?\b', 'printer'),

        # additive manufacturing variants -> "additive manufacturing"
        (r'\badditive[-\s]*manufactur(?:e|ed|ing)?\b', 'additive manufacturing'),
        (r'\badditivemanufactur(?:e|ing)\b', 'additive manufacturing'),
        (r'\baddive[-\s]*manufactu(?:ed|ring)?\b', 'additive manufacturing'),

        # material / materials -> material
        (r'\bmaterials?\b', 'material'),

        # used -> use
        (r'\bused\b', 'use'),

        # (disabled rule: r'\bprinting guns\b' -> '3d printing guns')

        # drop a leading "first" in front of 3d / 3d printing
        (r'\bfirst\s+3d\s+printing\b', '3d printing'),
        (r'\bfirst\s+3d\b', '3d'),

        # 3dprintindustry and friends -> 3d printing industry
        (r'\b3dprintindustry\b', '3d printing industry'),
        (r'\b3d[-\s]*print(?:ing)?\s+industry\b', '3d printing industry'),

        # robot / robots -> robotics
        (r'\brobots?\b', 'robotics'),
    )

    normalized = df.copy()
    column = normalized['Normalized Text'].fillna('')
    for regex, canonical in rules:
        column = column.str.replace(
            regex, canonical, regex=True, flags=re.IGNORECASE
        )
    normalized['Normalized Text'] = column
    return normalized



def extract_year_ngrams(df, custom_stopwords, n_range=(1, 3), min_df=2, top_n=50):
    """
    Count n-grams in df['Normalized Text'] and return the most frequent.

    Parameters
    ----------
    df : DataFrame with a 'Normalized Text' column; missing values are
        treated as empty strings.
    custom_stopwords : iterable of tokens handed to CountVectorizer as
        its stop-word list.
    n_range : (min_n, max_n) n-gram sizes passed to CountVectorizer.
    min_df : CountVectorizer's minimum document frequency for a term.
    top_n : maximum number of rows returned.

    Returns
    -------
    DataFrame with columns 'ngram' and 'frequency' (total occurrences over
    all rows of ``df``), sorted by descending frequency and cut to
    ``top_n`` rows. An empty frame with those two columns is returned when
    there is no text, or when the vectorizer ends up with no terms.
    """
    texts = df['Normalized Text'].fillna('').tolist()
    if not texts:
        return pd.DataFrame(columns=['ngram', 'frequency'])

    vec = CountVectorizer(
        ngram_range=n_range,
        stop_words=list(custom_stopwords),
        min_df=min_df,
        token_pattern=r'\b[a-zA-Z0-9_]+\b'
    )
    try:
        X = vec.fit_transform(texts)
    except ValueError:
        # CountVectorizer raises ValueError when nothing is left to count
        # (only stop words, or every term pruned by min_df). Treat that
        # like the "no text" case instead of aborting the caller's loop.
        return pd.DataFrame(columns=['ngram', 'frequency'])

    names = vec.get_feature_names_out()
    # Column sums of the document-term matrix = corpus-wide term counts.
    counts = X.sum(axis=0).A1
    df_ngrams = pd.DataFrame({'ngram': names, 'frequency': counts})
    return df_ngrams.sort_values('frequency', ascending=False).head(top_n)

def find_trending_terms(all_ngrams_df, min_years=2, top_n=15):
    """
    Rank n-grams by how much their per-tweet frequency grew over time.

    Parameters
    ----------
    all_ngrams_df : DataFrame with columns 'ngram', 'year',
        'freq_per_100_tweets' and 'word_count'. It is not modified.
    min_years : a term must appear in at least this many distinct years.
    top_n : maximum number of rows returned.

    Returns
    -------
    DataFrame with one row per growing term and the columns 'term',
    'first_year', 'last_year', 'first_freq', 'last_freq', 'abs_change',
    'rel_change' and 'word_count', sorted by descending 'rel_change' and
    cut to ``top_n`` rows. Growth compares the term's earliest year with
    its latest year; terms whose frequency did not increase are dropped.
    The frame is empty (but still has these columns) when nothing
    qualifies.
    """
    # Terms excluded from the ranking.
    remove_terms = {
        'help', 'get', 'twitter', 'like', 'one', 'using', 'see',
        'youtube', 'today', 'check', 'us', 'made', 'first', 'time',
        'learn', 'de', 'want', 'week', 'know', 'good', 'year', 'day',
        'need', 'project', 'would', 'high', 'free', 'think',
        'utm_medium', 'utm_source', 'igshid',
        'post', 'way', 'also', 'thanks',
        'something', 'coming', 'utm_source twitter', 'years',
        'two', 'look', 'going', 'latest', 'got', 'take', 'utm_campaign', 'x',
        # The comma after 'youtube video' matters: without it Python joins
        # the literal with the next one and neither term is filtered.
        'come', 'another', 'hours', 'really', 'youtube video',

        # generic filler / low-value
        'work', 'heart', 'read', 'people', 'set', 'based', 'designed',
        'social', 'building', 'go', 'best', 'full', 'team', 'life',
        'ready', 'buy', 'live', 'watch', 'use', 'working', 'things',
        'love', 'stuff', 'thing', 'back', 'even', 'many', 'make',
        'build', 'find', 'start', 'last', 'ever', 'test', 'share',
        'new', 'r', 'e', 'w', 'hp', 'drone', 'app',

        # additional exclusions
        'rocket', 'ai', 'engineering', 'fashion', 'stem',
        'university', 'school', 'lab', 'students',
        'market', 'global', 'construction', 'process', 'innovation',
        'maker', 'making', 'fun',
    }
    columns = [
        'term', 'first_year', 'last_year', 'first_freq', 'last_freq',
        'abs_change', 'rel_change', 'word_count',
    ]

    # Work on a copy so the caller's frame stays intact.
    df = all_ngrams_df.copy()

    # Drop unwanted terms and n-grams that are pure digit strings.
    keep = (~df['ngram'].isin(remove_terms)) & (~df['ngram'].str.isdigit())
    df = df[keep]

    records = []
    for term, grp in df.groupby('ngram'):
        # Only terms seen in enough distinct years can show a trend.
        if grp['year'].nunique() < min_years:
            continue
        grp = grp.sort_values('year')
        first, last = grp.iloc[0], grp.iloc[-1]
        first_freq = first['freq_per_100_tweets']
        last_freq = last['freq_per_100_tweets']
        abs_ch = last_freq - first_freq
        if abs_ch <= 0:
            continue
        # Growth from a zero baseline is reported as infinite.
        rel_ch = abs_ch / first_freq if first_freq > 0 else float('inf')
        records.append({
            'term':       term,
            'first_year': int(first['year']),
            'last_year':  int(last['year']),
            'first_freq': first_freq,
            'last_freq':  last_freq,
            'abs_change': abs_ch,
            'rel_change': rel_ch,
            'word_count': int(first['word_count'])
        })

    # Passing the column list keeps 'rel_change' present even when there
    # are no records, so the sort below cannot fail on an empty result.
    df_trend = pd.DataFrame(records, columns=columns)
    return df_trend.sort_values('rel_change', ascending=False).head(top_n)

def main(csv_path='CLEANED_DATASET_20250520_135803.csv',
         output_path='trending_terms.csv'):
    """
    Run the trending-term analysis end to end.

    Reads the tweet CSV at ``csv_path`` (it must have a 'Date' column and
    a 'Normalized Text' column), normalizes term variants, builds a
    per-year n-gram frequency table, ranks terms by relative growth,
    writes the ranking to ``output_path`` and prints it.

    Both parameters default to the file names the script has always used,
    so calling ``main()`` with no arguments behaves as before.
    """
    # 1) Setup
    nltk.download('stopwords')
    # NOTE(review): 'punkt' is not used by the code visible here -- confirm
    # before removing the download.
    nltk.download('punkt')
    stops = set(stopwords.words('english'))
    twitter_stops = {'rt','amp','http','https','co','t','s','m','re','ve','ll','d'}
    custom_stops = stops.union(twitter_stops)

    # 2) Read raw tweets
    df = pd.read_csv(csv_path, parse_dates=['Date'])
    df['year'] = df['Date'].dt.year

    # 2a) Normalize all target variants before n-gram extraction
    df = normalize_variants(df)

    # 3) Build yearly n-gram table
    results = []
    for year, sub in df.groupby('year'):
        # Years with fewer than 5 tweets are skipped.
        if len(sub) < 5:
            continue
        ngr = extract_year_ngrams(sub, custom_stops, n_range=(1, 3), min_df=2, top_n=200)
        ngr['year'] = year
        ngr['tweet_count'] = len(sub)
        # Scale counts by the year's tweet volume so years are comparable.
        ngr['freq_per_100_tweets'] = 100 * ngr['frequency'] / len(sub)
        ngr['word_count'] = ngr['ngram'].str.split().apply(len)
        results.append(ngr)

    # pd.concat raises ValueError on an empty list, so stop early when
    # every year was skipped.
    if not results:
        print("No year has enough tweets to analyse; nothing written.")
        return

    all_ngrams = pd.concat(results, ignore_index=True)
    #all_ngrams.to_csv('yearly_ngram_frequencies.csv', index=False)

    # 4) Find top trending terms
    trending = find_trending_terms(all_ngrams, min_years=2, top_n=1000)
    trending.to_csv(output_path, index=False)

    # 5) Print results
    print("Top trending terms:")
    for _, r in trending.iterrows():
        print(f"{r.term}: {r.rel_change:.2f}× growth "
              f"({r.first_freq:.2f}→{r.last_freq:.2f} per 100 tweets "
              f"{r.first_year}→{r.last_year})")

if __name__ == '__main__':
    main()


[nltk_data] Downloading package stopwords to /home/shola/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/shola/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Top trending terms:
pla: 2.66× growth (0.66→2.41 per 100 tweets 2016→2023)
rocket: 2.63× growth (0.77→2.79 per 100 tweets 2013→2023)
material: 1.99× growth (1.29→3.88 per 100 tweets 2012→2023)
additive: 1.84× growth (3.25→9.22 per 100 tweets 2012→2023)
additive manufacturing: 1.79× growth (3.10→8.66 per 100 tweets 2012→2023)
engineering: 1.57× growth (0.66→1.71 per 100 tweets 2016→2023)
filament: 1.40× growth (1.15→2.76 per 100 tweets 2014→2023)
models: 1.33× growth (2.18→5.07 per 100 tweets 2012→2023)
resin: 1.19× growth (1.07→2.35 per 100 tweets 2019→2023)
research: 1.18× growth (1.08→2.35 per 100 tweets 2012→2023)
printing 3d printing: 1.13× growth (0.84→1.78 per 100 tweets 2012→2023)
parts: 1.09× growth (2.51→5.25 per 100 tweets 2012→2023)
market: 1.09× growth (0.81→1.70 per 100 tweets 2013→2023)
construction: 1.04× growth (0.74→1.52 per 100 tweets 2017→2023)
ai: 1.00× growth (1.05→2.10 per 100 tweets 2018→2023)
manufacturing: 0.99× growth (5.76→11.45 per 100 tweets 2012→2023)
inno