In [39]:
# Imports
import pandas as pd
from langdetect import detect, DetectorFactory
from langdetect.lang_detect_exception import LangDetectException
#Set seed for detector
DetectorFactory.seed=0
import gensim
import nltk
from gensim import corpora
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
import string

In [2]:
# Import TSV and convert to CSV
tsv_file= 'all_annotated.tsv'
csv_table= pd.read_table(tsv_file, sep='\t')
csv_table.to_csv('umass_tweets.csv', index=False)

In [3]:
# Bring data in
df = pd.read_csv(r'umass_tweets.csv')
df.head()

Unnamed: 0,Tweet ID,Country,Date,Tweet,Definitely English,Ambiguous,Definitely Not English,Code-Switched,Ambiguous due to Named Entities,Automatically Generated Tweets
0,434215992731136000,TR,2014-02-14,Bugün bulusmami lazimdiii,0,0,1,0,0,0
1,285903159434563584,TR,2013-01-01,Volkan konak adami tribe sokar yemin ederim :D,0,0,1,0,0,0
2,285948076496142336,NL,2013-01-01,Bed,1,0,0,0,0,0
3,285965965118824448,US,2013-01-01,I felt my first flash of violence at some fool...,1,0,0,0,0,0
4,286057979831275520,US,2013-01-01,Ladies drink and get in free till 10:30,1,0,0,0,0,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10502 entries, 0 to 10501
Data columns (total 10 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Tweet ID                         10502 non-null  int64 
 1   Country                          10492 non-null  object
 2   Date                             10502 non-null  object
 3   Tweet                            10502 non-null  object
 4   Definitely English               10502 non-null  int64 
 5   Ambiguous                        10502 non-null  int64 
 6   Definitely Not English           10502 non-null  int64 
 7   Code-Switched                    10502 non-null  int64 
 8   Ambiguous due to Named Entities  10502 non-null  int64 
 9   Automatically Generated Tweets   10502 non-null  int64 
dtypes: int64(7), object(3)
memory usage: 820.6+ KB


In [7]:
# Clean DF up
def clean_column_names(df):
    # Get the current column names
    columns = df.columns.tolist()
    # Clean and update the column names
    new_columns = []
    for column in columns:
        # Convert to lowercase
        column = column.lower() 
        # Replace spaces with underscores to make snake case
        column = column.replace(' ', '_') 
        new_columns.append(column)
    # Rename the columns in the DataFrame
    df.columns = new_columns

    return df

In [8]:
df = clean_column_names(df)

In [9]:
df.head()

Unnamed: 0,tweet_id,country,date,tweet,definitely_english,ambiguous,definitely_not_english,code-switched,ambiguous_due_to_named_entities,automatically_generated_tweets
0,434215992731136000,TR,2014-02-14,Bugün bulusmami lazimdiii,0,0,1,0,0,0
1,285903159434563584,TR,2013-01-01,Volkan konak adami tribe sokar yemin ederim :D,0,0,1,0,0,0
2,285948076496142336,NL,2013-01-01,Bed,1,0,0,0,0,0
3,285965965118824448,US,2013-01-01,I felt my first flash of violence at some fool...,1,0,0,0,0,0
4,286057979831275520,US,2013-01-01,Ladies drink and get in free till 10:30,1,0,0,0,0,0


In [11]:
# Function to detect language
def detect_language(text):
    try:
        return detect(text)
    except LangDetectException:
        # In case the language can't be detected, return 'unknown'
        return 'unknown'

In [12]:
df['language'] = df['tweet'].apply(detect_language)

In [17]:
df.language.nunique()

47

In [18]:
df.language.value_counts()

language
en         4131
id         1070
es          897
pt          880
tr          510
ja          338
tl          277
fr          260
it          197
so          175
ca          146
ru          145
de          144
nl          123
th          113
ar          112
et          112
af          111
fi           75
sv           63
ro           56
sw           56
sl           55
no           54
cy           51
da           49
ko           38
unknown      33
pl           32
hr           32
sq           21
hu           21
vi           21
bg           20
lt           20
lv           14
sk           10
cs            9
mk            8
fa            6
uk            5
zh-cn         3
el            2
ta            2
he            2
hi            2
zh-tw         1
Name: count, dtype: int64

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10502 entries, 0 to 10501
Data columns (total 11 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   tweet_id                         10502 non-null  int64 
 1   country                          10492 non-null  object
 2   date                             10502 non-null  object
 3   tweet                            10502 non-null  object
 4   definitely_english               10502 non-null  int64 
 5   ambiguous                        10502 non-null  int64 
 6   definitely_not_english           10502 non-null  int64 
 7   code-switched                    10502 non-null  int64 
 8   ambiguous_due_to_named_entities  10502 non-null  int64 
 9   automatically_generated_tweets   10502 non-null  int64 
 10  language                         10502 non-null  object
dtypes: int64(7), object(4)
memory usage: 902.6+ KB


# Segmenting data into separate DFs by language

## English

In [22]:
df_en = df[df['language'] == 'en']

In [23]:
# Preprocessing
stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()

def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized

doc_clean = [clean(doc).split() for doc in df_en['tweet']]

In [24]:
# Prepping document term matrix
dictionary = corpora.Dictionary(doc_clean)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in doc_clean]

In [25]:
# Running LDA
Lda = gensim.models.ldamodel.LdaModel
ldamodel = Lda(doc_term_matrix, num_topics=5, id2word=dictionary, passes=50)  # adjust num_topics and passes based on your data

In [26]:
# View the topics
print(ldamodel.print_topics(num_topics=5, num_words=5))

[(0, '0.010*"im" + 0.008*"gt" + 0.006*"big" + 0.005*"de" + 0.005*"day"'), (1, '0.016*"im" + 0.008*"like" + 0.007*"go" + 0.006*"get" + 0.006*"back"'), (2, '0.015*"—" + 0.010*"fit" + 0.009*"photo" + 0.009*"anyone" + 0.009*"hiring"'), (3, '0.050*"job" + 0.033*"hiring" + 0.015*"careerarc" + 0.014*"latest" + 0.011*"were"'), (4, '0.009*"great" + 0.009*"here" + 0.007*"you" + 0.007*"day" + 0.006*"im"')]


In [28]:
# This will give you a list of topic distributions for each tweet
topic_dist_list = [ldamodel.get_document_topics(item) for item in doc_term_matrix]

In [29]:
# Extract Most Probable Topic
def get_dominant_topic(topic_list):
    # Sorts the topics based on the probabilities, then picks the highest one
    return sorted(topic_list, key=lambda x: x[1], reverse=True)[0][0]

dominant_topics = [get_dominant_topic(topic_dist) for topic_dist in topic_dist_list]

In [31]:
# Add to dataframe
df_en.loc[:, 'dominant_topic'] = dominant_topics

In [32]:
df_en.head()

Unnamed: 0,tweet_id,country,date,tweet,definitely_english,ambiguous,definitely_not_english,code-switched,ambiguous_due_to_named_entities,automatically_generated_tweets,language,dominant_topic
3,285965965118824448,US,2013-01-01,I felt my first flash of violence at some fool...,1,0,0,0,0,0,en,1
7,286216100784521216,GB,2013-01-01,Watching #Miranda On bbc1!!! @mermhart u r HIL...,1,0,0,0,0,0,en,0
16,286525170670243840,US,2013-01-02,@Dennycrowe all over twitter because you and y...,1,0,0,0,0,0,en,0
19,286916662836490241,US,2013-01-03,"~ i'm falling apart,with a broken heart,barely...",1,0,0,0,0,0,en,0
21,286927433498759168,US,2013-01-03,"The way you treat me. The way you accept me, a...",1,0,0,0,0,0,en,1


In [33]:
# Create topics list dictionary and append with the calculated topics
topics_dict = {}
topics_dict['en'] = ldamodel.print_topics(num_topics=5, num_words=5)

## Indonesian

In [34]:
df_id = df[df['language'] == 'id'].copy()

In [37]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [40]:
# Preprocessing
stop = set(stopwords.words('indonesian'))
exclude = set(string.punctuation)

In [41]:
#Using NLP ID indonesian lemmatizer
from nlp_id.lemmatizer import Lemmatizer 

ValueError: sklearn.metrics._dist_metrics.DistanceMetric size changed, may indicate binary incompatibility. Expected 472 from C header, got 16 from PyObject