In [2]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/zm8bh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
import pandas as pd
import os
all_dfs = []

for f in os.listdir('short_prompt_responses'):
    if f.endswith('.csv'):
        cur_df = pd.read_csv('short_prompt_responses/'+f,error_bad_lines=False)
        all_dfs.append(cur_df)
    

#df = pd.read_csv('chatgpt_prompts.csv',error_bad_lines=False)
df = pd.concat(all_dfs)[['response']]

In [5]:
df

Unnamed: 0,response
0,I am a Mexican employee; I carry the pride of ...
1,I am a dedicated and hardworking Mexican emplo...
2,"As a Mexican employee, I bring a rich cultural..."
3,"As a Mexican employee, I take immense pride in..."
4,"As a Mexican employee, I take immense pride in..."
...,...
0,"As an Indian employee, I consider myself an am..."
1,I am an Indian employee and my journey to work...
2,"I am an Indian employee, and my days are spent..."
3,"I am an Indian employee, born and raised amids..."


In [9]:
import re

def remove_links(tweet):
    '''Takes a string and removes web links from it'''
    tweet = re.sub(r'http\S+', '', tweet) # remove http links
    tweet = re.sub(r'bit.ly/\S+', '', tweet) # rempve bitly links
    tweet = tweet.strip('[link]') # remove [links]
    return tweet

def remove_users(tweet):
    '''Takes a string and removes retweet and @user information'''
    tweet = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove retweet
    tweet = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', tweet) # remove tweeted at
    return tweet

my_stopwords = nltk.corpus.stopwords.words('english')
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'

# cleaning master function
def clean_tweet(tweet, bigrams=False):
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = tweet.lower() # lower case
    tweet = re.sub('['+my_punctuation + ']+', ' ', tweet) # strip punctuation
    tweet = re.sub('\s+', ' ', tweet) #remove double spacing
    tweet = re.sub('([0-9]+)', '', tweet) # remove numbers
    tweet_token_list = [word for word in tweet.split(' ')
                            if word not in my_stopwords] # remove stopwords

    #tweet_token_list = [word_rooter(word) if '#' not in word else word
    #                    for word in tweet_token_list] # apply word rooter
    if bigrams:
        tweet_token_list = tweet_token_list+[tweet_token_list[i]+'_'+tweet_token_list[i+1]
                                            for i in range(len(tweet_token_list)-1)]
    tweet = ' '.join(tweet_token_list)
    return tweet


In [10]:
df['clean_response'] = df['response'].apply(clean_tweet)
df

Unnamed: 0,response,clean_response
0,I am a Mexican employee; I carry the pride of ...,mexican employee carry pride heritage everythi...
1,I am a dedicated and hardworking Mexican emplo...,dedicated hardworking mexican employee roots n...
2,"As a Mexican employee, I bring a rich cultural...",mexican employee bring rich cultural backgroun...
3,"As a Mexican employee, I take immense pride in...",mexican employee take immense pride work upbri...
4,"As a Mexican employee, I take immense pride in...",mexican employee take immense pride work ethic...
...,...,...
0,"As an Indian employee, I consider myself an am...",indian employee consider amalgamation hard wor...
1,I am an Indian employee and my journey to work...,indian employee journey work filled dedication...
2,"I am an Indian employee, and my days are spent...",indian employee days spent fulfilling array ta...
3,"I am an Indian employee, born and raised amids...",indian employee born raised amidst diverse cul...


In [11]:
from sklearn.feature_extraction.text import CountVectorizer

# the vectorizer object will be used to transform text to vector form
vectorizer = CountVectorizer(max_df=1.0, min_df=1, token_pattern='\w+|\$[\d\.]+|\S+')

# apply transformation
tf = vectorizer.fit_transform(df['clean_response']).toarray() #xth row te yth word thakle 1, 0-1 matrix

# tf_feature_names tells us what word each column in the matric represents
tf_feature_names = vectorizer.get_feature_names() #yth word as a list




In [12]:
len(tf[0].tolist())

5890

In [13]:
tf_feature_names[20]

'accents'

In [14]:
from sklearn.decomposition import LatentDirichletAllocation

number_of_topics = 10

model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)


In [15]:
model.fit(tf)


LatentDirichletAllocation(random_state=0)

In [16]:
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

no_top_words = 20
display_topics(model, tf_feature_names, no_top_words)


Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights,Topic 6 words,Topic 6 weights,Topic 7 words,Topic 7 weights,Topic 8 words,Topic 8 weights,Topic 9 words,Topic 9 weights
0,indian,194.3,indian,21.6,indian,45.9,work,206.5,work,345.3,mexican,100.7,student,276.5,de,18.1,classmates,1.1,international,249.0
1,work,76.5,cultural,17.9,india,17.4,employee,190.5,employee,161.9,work,36.9,country,233.7,mi,15.1,foster,1.1,student,246.1
2,india,47.3,bangladeshi,14.9,student,13.8,may,148.4,american,141.6,bring,36.4,international,229.9,que,14.1,slowly,1.1,home,183.3
3,diverse,34.4,international,11.8,hindi,13.4,indian,127.5,life,119.3,workplace,30.1,may,204.2,la,12.1,harder,1.1,mexican,140.0
4,employee,32.9,also,10.3,international,11.4,often,112.7,day,103.5,culture,29.5,language,188.7,en,10.1,starting,1.1,new,130.4
5,culture,32.5,culture,9.5,languages,9.8,employees,109.9,professional,88.5,spanish,24.3,new,139.2,lo,7.1,motivate,1.1,culture,123.3
6,global,31.2,country,9.1,cultural,9.7,also,108.8,also,84.0,strong,20.0,studying,138.6,los,6.1,halfway,1.1,journey,114.4
7,values,29.4,new,9.0,like,7.5,strong,104.4,time,79.5,bangladeshi,19.7,culture,117.0,el,6.1,asking,1.1,country,106.6
8,every,28.7,knowledge,9.0,new,6.9,individual,97.2,respect,75.1,english,19.2,cultural,115.5,un,6.1,chase,1.1,every,104.1
9,knowledge,26.2,challenges,8.5,festivals,6.9,skills,93.0,bangladeshi,74.3,employee,19.0,students,114.1,es,5.1,closest,1.1,life,94.3


## Specific country and specific type

In [17]:
nationality = 'Indian'
occupation = 'international_student'

substr = nationality+'_'+occupation

all_dfs = []

for f in os.listdir('short_prompt_responses/'):
    if substr in f:
        print(f)
        cur_df = pd.read_csv('short_prompt_responses/'+f)
        all_dfs.append(cur_df)

df = pd.concat(all_dfs)[['response']]

df['clean_response'] = df['response'].apply(clean_tweet)

# the vectorizer object will be used to transform text to vector form
vectorizer = CountVectorizer(max_df=1.0, min_df=1, token_pattern='\w+|\$[\d\.]+|\S+')

# apply transformation
tf = vectorizer.fit_transform(df['clean_response']).toarray() #xth row te yth word thakle 1, 0-1 matrix

# tf_feature_names tells us what word each column in the matric represents
tf_feature_names = vectorizer.get_feature_names() #yth word as a list

number_of_topics = 5

model = LatentDirichletAllocation(n_components=number_of_topics, random_state=0)

model.fit(tf)

no_top_words = 20
display_topics(model, tf_feature_names, no_top_words)


short_prompt_responses_Indian_international_student_1.csv
short_prompt_responses_Indian_international_student_0.csv
short_prompt_responses_Indian_international_student_7.csv
short_prompt_responses_Indian_international_student_9.csv
short_prompt_responses_Indian_international_student_6.csv
short_prompt_responses_Indian_international_student_5.csv
short_prompt_responses_Indian_international_student_4.csv
short_prompt_responses_Indian_international_student_3.csv
short_prompt_responses_Indian_international_student_2.csv
short_prompt_responses_Indian_international_student_8.csv




Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights
0,levels,0.2,indian,37.5,indian,47.0,indian,63.8,indian,33.6
1,specific,0.2,international,30.2,student,37.1,student,50.5,international,23.1
2,possess,0.2,student,29.8,may,25.7,international,43.5,student,21.5
3,shock,0.2,india,20.0,international,24.9,india,36.8,culture,13.9
4,participate,0.2,education,16.7,india,21.6,new,28.4,learning,11.2
5,besides,0.2,students,15.7,cultural,13.9,cultural,27.7,knowledge,11.1
6,schools,0.2,cultural,15.0,country,13.4,country,26.9,new,10.6
7,ranging,0.2,country,14.9,education,13.3,may,23.3,life,10.5
8,systems,0.2,often,14.2,like,12.9,home,21.5,home,10.1
9,chosen,0.2,english,13.7,culture,12.7,education,20.9,find,9.9
