In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score

In [6]:
users = pd.read_csv("Twitter-Usernames.csv")
users = users[['submitted username']]
users.head()

Unnamed: 0,submitted username
0,anthonyadragna
1,mirzaaftab
2,stephanieakin
3,madisonalder
4,zeeshanaleem


In [8]:
re_df = pd.read_csv('/home/sagemaker-user/BeltwayJournalists/1123_raw_post_measures.csv')
re_df = re_df[['Id', 'Catergory_1']]
re_df.head()

Unnamed: 0,Id,Catergory_1
0,greta,Television Network News and Public Affairs
1,ZekeJMiller,Wire Services
2,sarahkliff,Digital
3,BCAppelbaum,Newspapers
4,jeneps,News Magazine


In [9]:
tweets = pd.read_csv('User_tweets.csv')
tweets.head()

Unnamed: 0,username,tweet_id,tweet_text
0,ABC7Brad,1340354668564721665,Certainly plenty of bad news in today’s Maryla...
1,ABC7Brad,1340349176085950464,@brucejohnson9 @wusa9 @SEDCScoop @PGCountyScoo...
2,ABC7Brad,1340287339038437378,This is just a tiny fraction of the line on rt...
3,ABC7Brad,1340120610995900418,"I first met @brucejohnson9 when ,many years ag..."
4,ABC7Brad,1340104105772126208,RT @ABC7News: JUST IN via AP: U.S. clears seco...


In [13]:
print(tweets.shape)
print(re_df.shape)
tweets_user = pd.merge(tweets, re_df, how='inner', left_on='username', right_on='Id')
print(tweets_user.shape)
tweets_user.head()

(21799, 3)
(2015, 2)
(17774, 5)


Unnamed: 0,username,tweet_id,tweet_text,Id,Catergory_1
0,ABC7Brad,1340354668564721665,Certainly plenty of bad news in today’s Maryla...,ABC7Brad,Local TV Stations
1,ABC7Brad,1340349176085950464,@brucejohnson9 @wusa9 @SEDCScoop @PGCountyScoo...,ABC7Brad,Local TV Stations
2,ABC7Brad,1340287339038437378,This is just a tiny fraction of the line on rt...,ABC7Brad,Local TV Stations
3,ABC7Brad,1340120610995900418,"I first met @brucejohnson9 when ,many years ag...",ABC7Brad,Local TV Stations
4,ABC7Brad,1340104105772126208,RT @ABC7News: JUST IN via AP: U.S. clears seco...,ABC7Brad,Local TV Stations


In [15]:
print(users.shape)
tweets_users = pd.merge(tweets, users, how='inner', left_on='username', right_on='submitted username')
print(tweets_users.shape)
tweets_users.head()

(3595, 1)
(9098, 4)


Unnamed: 0,username,tweet_id,tweet_text,submitted username
0,aarond,1340739148265779201,@nhannahjones People who’ve never had a linked...,aarond
1,aarond,1340707009449127941,@emilyvdw Girl Timothy isn’t Tamothy?,aarond
2,aarond,1340704147365113859,RT @AOC: One major difference between GOP and ...,aarond
3,aarond,1340701064966901770,I think it’s pretty embarrassing that the expe...,aarond
4,aarond,1340700525101273088,When this is over I hope everybody doesn’t los...,aarond


In [18]:
tweets_with_users = pd.concat([tweets_user, tweets_users])
tweets_with_users = tweets_with_users[['username', 'tweet_text', 'Catergory_1']]
tweets_with_users = tweets_with_users.rename(columns={'Catergory_1': 'Catergory'})
print(tweets_with_users.shape)
tweets_with_users = tweets_with_users.drop_duplicates()
print(tweets_with_users.shape)
tweets_with_users.head()

(26872, 3)
(14647, 3)


Unnamed: 0,username,tweet_text,Catergory
0,ABC7Brad,Certainly plenty of bad news in today’s Maryla...,Local TV Stations
1,ABC7Brad,@brucejohnson9 @wusa9 @SEDCScoop @PGCountyScoo...,Local TV Stations
2,ABC7Brad,This is just a tiny fraction of the line on rt...,Local TV Stations
3,ABC7Brad,"I first met @brucejohnson9 when ,many years ag...",Local TV Stations
4,ABC7Brad,RT @ABC7News: JUST IN via AP: U.S. clears seco...,Local TV Stations


In [19]:
tweets_with_users.to_csv('User-Tweets-Category.csv', index=False)

In [21]:
tweets_with_users['Catergory'].unique()

array(['Local TV Stations', 'News Magazine',
       'Television Network News and Public Affairs', '-', 'Wire Services',
       'Digital', 'Capitol Hill publications', 'Newspapers', nan],
      dtype=object)

In [22]:
def convert_to_num(target):
    if target == 'Local TV Stations':
        return 0
    elif target == 'News Magazine':
        return 1
    elif target == 'Television Network News and Public Affairs':
        return 2
    elif target == 'Wire Services':
        return 3
    elif target == 'Digital':
        return 4
    elif target == 'Capitol Hill publications':
        return 5
    elif target == 'Newspapers':
        return 6
    else:
        return np.nan

In [23]:
tweets_with_users['target'] = tweets_with_users['Catergory'].apply(convert_to_num)

In [27]:
print(tweets_with_users.shape)
training_data = tweets_with_users[tweets_with_users['target'].notna()]
print(training_data.shape)

train_data, test_data = train_test_split(training_data, test_size=0.25, random_state=42)
print(train_data.shape)
print(test_data.shape)

(14647, 4)
(8756, 4)
(6567, 4)
(2189, 4)


In [29]:
apply_data = tweets_with_users[tweets_with_users['target'].isna()]
print(apply_data.shape)
apply_data.head()

(5891, 4)


Unnamed: 0,username,tweet_text,Catergory,target
39,AGKootenay,"RT @Jinga11s: Some of you need cheering up, so...",-,
40,AGKootenay,Update: My child is now beating me in the face...,-,
41,AGKootenay,@mollyknc If S&amp;P wants to hire him as a Ch...,-,
42,AGKootenay,"If you ever need a motivator on deadline, put ...",-,
43,AGKootenay,RT @SPGMIHealthcare: Armed with as much as $50...,-,


In [30]:
#create count object
count_vectorizer = CountVectorizer(stop_words='english')

#Transform the training data using only the 'text' column values
count_X_train = count_vectorizer.fit_transform(train_data['tweet_text'])

#Transform the test data using only the 'text' column values
count_X_test = count_vectorizer.transform(test_data['tweet_text'])

In [31]:
train_x = count_X_train
test_x = count_X_test

In [32]:
y_train = train_data['target']
y_test = test_data['target']

In [33]:
svm_obj = SGDClassifier()

svm = svm_obj.fit(train_x, y_train)

print('Testing Model.')
y_pred = svm.predict(test_x)

acc = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='weighted')
perc = precision_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

#model results
print('Model Results')
print('Accuracy: {}'.format(acc))
print('Recall: {}'.format(recall))
print('Precision: {}'.format(perc))
print('F1-Score: {}'.format(f1))


Testing Model.
Model Results
Accuracy: 0.37368661489264504
Recall: 0.37368661489264504
Precision: 0.37124540884034707
F1-Score: 0.36477261783460857


In [38]:
apply_text = count_vectorizer.transform(apply_data['tweet_text'])
apply_data['predict'] = svm.predict(apply_text)
apply_data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,username,tweet_text,Catergory,target,predict
0,AGKootenay,"RT @Jinga11s: Some of you need cheering up, so...",-,,4.0
1,AGKootenay,Update: My child is now beating me in the face...,-,,2.0
2,AGKootenay,@mollyknc If S&amp;P wants to hire him as a Ch...,-,,2.0
3,AGKootenay,"If you ever need a motivator on deadline, put ...",-,,2.0
4,AGKootenay,RT @SPGMIHealthcare: Armed with as much as $50...,-,,4.0


In [39]:
def convert_to_cat(num):
    if num == 0:
        return 'Local TV Stations'
    elif num == 1:
        return 'News Magazine'
    elif num == 2:
        return 'Television Network News and Public Affairs'
    elif num == 3:
        return 'Wire Services'
    elif num == 4:
        return 'Digital'
    elif num == 5:
        return 'Capitol Hill publications'
    elif num == 6:
        return 'Newspapers'
    else:
        return np.nan

In [52]:
apply_data['count'] = 1
apply_data_group = apply_data.groupby(['username', 'predict'], as_index=False).sum()
apply_data_max = apply_data_group.groupby(['username'], as_index=False)['count'].max()
apply_data_ = pd.merge(apply_data_group, apply_data_max, how='left', on='username')
apply_data_ = apply_data_[apply_data_['count_x'] == apply_data_['count_y']]
apply_data_.head()

Unnamed: 0,username,predict,count_x,count_y
0,AGKootenay,2.0,5,5
5,AMBankstw,4.0,7,7
7,ARiquier,2.0,6,6
14,ASimendinger,6.0,5,5
17,AaronMehta,2.0,3,3


In [53]:
apply_data_ = apply_data_[['username',	'predict']]
apply_data_['Catergory'] = apply_data_['predict'].apply(convert_to_cat)
apply_data_.head()

Unnamed: 0,username,predict,Catergory
0,AGKootenay,2.0,Television Network News and Public Affairs
5,AMBankstw,4.0,Digital
7,ARiquier,2.0,Television Network News and Public Affairs
14,ASimendinger,6.0,Newspapers
17,AaronMehta,2.0,Television Network News and Public Affairs


In [56]:
apply_data_ = apply_data_[['username', 'Catergory']]
existing = training_data[['username', 'Catergory']]
print(existing.shape)
existing = existing.drop_duplicates()
print(existing.shape)
full_data = pd.concat([apply_data_, existing])
print(full_data.shape)
full_data.head()

(8756, 2)
(1098, 2)
(2087, 2)


Unnamed: 0,username,Catergory
0,AGKootenay,Television Network News and Public Affairs
5,AMBankstw,Digital
7,ARiquier,Television Network News and Public Affairs
14,ASimendinger,Newspapers
17,AaronMehta,Television Network News and Public Affairs


In [57]:
full_data.to_csv('Journalist-Groups.csv', index=False)