## BREAK

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import nltk
import re

def normalize_variants(df):
    """
    Standardize variants in df['Normalized Text'] so that downstream
    n-grams come out unified.
    """
    fixes = {

        # FIRST: capture any “printing guns” phrase exactly
        r'\bprinting\s+guns\b':                   '3d printing guns',
    
        # THEN: capture lone “gun” or “guns”
        r'\bgun(?:s)?\b':                         '3d printing guns',
        
        # research/researchers → research
        r'\bresearch(?:ers)?\b':                   'research',

        # model/models → models
        r'\bmodels?\b':                           'models',
        # part/parts → parts
        r'\bparts?\b':                            'parts',

        # 3d printing variants → "3d printing"
        r'\b3d[-\s]*print(?:ing|ed|ted)?\b':      '3d printing',
        r'\b3dprinting\b':                        '3d printing',

        # 3d printer variants → "3d printer"
        r'\b3d[-\s]*printer(?:s)?\b':             '3d printer',
        r'\b3dprinter(?:s)?\b':                   '3d printer',
        r'\bprinters?\b':                         'printer',

        # additive manufacturing variants → "additive manufacturing"
        r'\badditive[-\s]*manufactur(?:e|ed|ing)?\b': 'additive manufacturing',
        r'\badditivemanufactur(?:e|ing)\b':           'additive manufacturing',
        r'\baddive[-\s]*manufactu(?:ed|ring)?\b':      'additive manufacturing',

        # materials/material → material
        r'\bmaterials?\b':                         'material',

        # use/used → use
        r'\bused\b':                               'use',


        # first 3d / first 3d printing → 3d( printing)
        r'\bfirst\s+3d\s+printing\b':              '3d printing',
        r'\bfirst\s+3d\b':                         '3d',

        # 3dprintindustry → 3d printing industry
        r'\b3dprintindustry\b':                    '3d printing industry',
        r'\b3d[-\s]*print(?:ing)?\s+industry\b':   '3d printing industry',

        # rapid prototyping variants → "rapid prototyping"
        r'\brapid[-\s]*prototyp(?:e|ed|ing)?\b':      'rapid prototyping',
        r'\brapidprototyp(?:e|ing)\b':                'rapid prototyping',

        # robot/robots → robotics
        r'\brobots?\b': 'robotics',
    }

    # Apply each regex replacement, case-insensitive
    text = df['Normalized Text'].fillna('')
    for pattern, replacement in fixes.items():
        text = text.str.replace(pattern, replacement, regex=True, flags=re.IGNORECASE)

    df2 = df.copy()
    df2['Normalized Text'] = text
    return df2



def extract_year_ngrams(df, custom_stopwords, n_range=(1, 3), min_df=2, top_n=50):
    texts = df['Normalized Text'].fillna('').tolist()
    if not texts:
        return pd.DataFrame(columns=['ngram', 'frequency'])
    vec = CountVectorizer(
        ngram_range=n_range,
        stop_words=list(custom_stopwords),
        min_df=min_df,
        token_pattern=r'\b[a-zA-Z0-9_]+\b'
    )
    X = vec.fit_transform(texts)
    names = vec.get_feature_names_out()
    counts = X.sum(axis=0).A1
    df_ngrams = pd.DataFrame({'ngram': names, 'frequency': counts})
    return df_ngrams.sort_values('frequency', ascending=False).head(top_n)

def find_trending_terms(all_ngrams_df, min_years=2, top_n=15):
    # work on a copy so the original stays intact
    df = all_ngrams_df.copy()

    # remove unwanted terms and pure numbers
    remove_terms = {
        'help', 'get', 'twitter', 'like', 'one', 'using', 'see',
        'youtube', 'today', 'check', 'us', 'made', 'first', 'time', 'learn', 'de', 'want', 'first',
        'week', 'know', 'good', 'year', 'day', 'need', 'project', 'would','high','free', 'think',
        'utm_medium', 'utm_source', 'igshid',
        'post','way','also', 'thanks',
        'something','coming','utm_source twitter','years',
        'two', 'look', 'going', 'latest', 'got', 'take', 'utm_campaign', 'x', 
        'come', 'another', 'hours', 'really', 'utm_source twitter', 'youtube video'

        # generic filler / low-value
        'work', 'heart', 'read', 'people', 'set', 'based', 'designed',
        'social', 'building', 'go', 'best', 'full', 'team', 'life',
        'ready', 'buy', 'live', 'watch', 'use', 'working', 'things',
        'love', 'stuff', 'thing', 'back', 'even', 'many', 'make',
        'build', 'find', 'start', 'last', 'ever', 'test', 'share',
        'new', 'r', 'e', 'w', 'hp', 'drone', 'app', 'twitter',

        #
        'utm_source twitter', 'utm_source dlvr utm_medium','utm_source dlvr',
        'utm_medium twitter', 'youtube video', 'dlvr utm_medium twitter', 
        'dlvr utm_medium', 'liked youtube', 'liked youtube video', 
        'promo code', 'twitter utm_medium social', 'buy 3d printer', 
        'social utm_campaign social', 'utm_campaign social', 'looks like', 
        'twitter utm_medium', 'learn 3d printing', 'video youtube', 'need 3d', 'utm_source twitter utm_medium', 'utm_medium social',
        'first ever','see 3d', 'utm_source twitter 3d',
        'join us','via youtube','like 3d', 'twitter 3d printing', '3d 3d', 'like 3d printing', 'buy 3d', 'men women',
        'world 3d', 'could 3d', 'printing one', 'thanks 3d', 'fully 3d', 'love 3d', 'using 3d', 'first time', 'learn 3d'
    }

    remove_terms.update({
    'engineering', 'fashion', 'stem',
    'university', 'school', 'lab', 'students',
    'go', 'market', 'global', 'construction', 'process', 'innovation',
    'maker', 'making',
    'fun', 'love', 'life', 'watch', 'read'
    })
    
    mask = (~df['ngram'].isin(remove_terms)) & (~df['ngram'].str.isdigit())
    df = df[mask]

    # keep only terms that appear in at least min_years distinct years
    common = df.groupby('ngram').filter(
        lambda grp: grp['year'].nunique() >= min_years
    )

    records = []
    for term, grp in common.groupby('ngram'):
        grp = grp.sort_values('year')
        first, last = grp.iloc[0], grp.iloc[-1]
        abs_ch = last['freq_per_100_tweets'] - first['freq_per_100_tweets']
        if abs_ch <= 0:
            continue
        if first['freq_per_100_tweets'] > 0:
            rel_ch = abs_ch / first['freq_per_100_tweets']
        else:
            rel_ch = float('inf')
        records.append({
            'term':       term,
            'first_year': int(first['year']),
            'last_year':  int(last['year']),
            'first_freq': first['freq_per_100_tweets'],
            'last_freq':  last['freq_per_100_tweets'],
            'abs_change': abs_ch,
            'rel_change': rel_ch,
            'word_count': int(first['word_count'])
        })

    df_trend = pd.DataFrame(records)
    return df_trend.sort_values('rel_change', ascending=False).head(top_n)

def main():
    # Setup
    nltk.download('stopwords')
    nltk.download('punkt')
    stops = set(stopwords.words('english'))
    twitter_stops = {'rt','amp','http','https','co','t','s','m','re','ve','ll','d'}
    custom_stops = stops.union(twitter_stops)

    # Read raw tweets
    df = pd.read_csv('CLEANED_DATASET_20250520_135803.csv', parse_dates=['Date'])
    df['year'] = df['Date'].dt.year

    # Normalize all target variants before n-gram extraction
    df = normalize_variants(df)

    # Build yearly n-gram table
    results = []
    for year, sub in df.groupby('year'):
        if len(sub) < 5:
            continue
        ngr = extract_year_ngrams(sub, custom_stops, n_range=(2,2), min_df=2, top_n=200)
        ngr['year'] = year
        ngr['tweet_count'] = len(sub)
        ngr['freq_per_100_tweets'] = 100 * ngr['frequency'] / len(sub)
        ngr['word_count'] = ngr['ngram'].str.split().apply(len)
        results.append(ngr)

    all_ngrams = pd.concat(results, ignore_index=True)
    #all_ngrams.to_csv('yearly_ngram_frequencies.csv', index=False)

    # Find top trending terms
    trending = find_trending_terms(all_ngrams, min_years=2, top_n=1000)
    trending.to_csv('trending_terms.csv', index=False)

    # Print results
    print("Top trending terms:")
    for _, r in trending.iterrows():
        print(f"{r.term}: {r.rel_change:.2f}× growth "
              f"({r.first_freq:.2f}→{r.last_freq:.2f} per 100 tweets "
              f"{r.first_year}→{r.last_year})")

if __name__ == '__main__':
    main()


[nltk_data] Downloading package stopwords to /home/shola/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/shola/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Top trending terms:
3dp 3d: 4.11× growth (0.21→1.06 per 100 tweets 2016→2023)
manufacturing 3d: 3.17× growth (0.38→1.58 per 100 tweets 2012→2023)
printer world: 2.83× growth (0.20→0.77 per 100 tweets 2013→2015)
printing rocket: 2.50× growth (0.49→1.71 per 100 tweets 2013→2023)
printing additive: 2.35× growth (0.30→1.01 per 100 tweets 2012→2023)
printing news: 2.32× growth (0.31→1.03 per 100 tweets 2012→2023)
ender 3: 2.10× growth (0.22→0.69 per 100 tweets 2018→2023)
printing heart: 1.80× growth (0.27→0.76 per 100 tweets 2014→2019)
cody wilson: 1.80× growth (0.22→0.60 per 100 tweets 2013→2018)
additive manufacturing: 1.79× growth (3.10→8.66 per 100 tweets 2012→2023)
printing html: 1.56× growth (0.18→0.46 per 100 tweets 2013→2023)
printing parts: 1.42× growth (0.41→0.99 per 100 tweets 2012→2023)
resin 3d: 1.35× growth (0.28→0.65 per 100 tweets 2019→2023)
got 3d: 1.30× growth (0.23→0.52 per 100 tweets 2012→2023)
metal additive: 1.29× growth (0.16→0.37 per 100 tweets 2016→2023)
industry 3d

In [8]:
mask = df['Normalized Text'].str.contains(r'\b3d printing guns\b', case=False, na=False)
print("Rows with ‘3d printing guns’:", mask.sum())
print(df.loc[mask, 'Normalized Text'].unique()[:5])

Rows with ‘3d printing guns’: 226
['looks like 3d printing guns are gonna get a lot more popular tomorrow'
 'do you want to learn about 3d 3d printing guns directly from the builders and experts? \n\nin this thread are all our podcast episodes relating to diy and  3d printing  firearms.\n\nin chronological order'
 "publisher of  3d printing  gun plans says he's ok with consequences of at-home 3d printing guns"
 'publisher of  3d printing  gun plans says he\'ll accept "social consequences" of at-home 3d printing guns'
 'tech vs the state: new jersey bans 10+ round gun magazines, and transmitting 3d 3d printing guns files. \n\ntoday, a member of the pseudonymous group deterrence dispensed released a video of a  3d printing , 30 round capacity gun magazine. \n\nis this 1st amendment, 2nd amendment or both?']
