In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/train.csv')
test = pd.read_csv('/kaggle/input/jigsaw-agile-community-rules/test.csv')

In [3]:
train.keys()

Index(['row_id', 'body', 'rule', 'subreddit', 'positive_example_1',
       'positive_example_2', 'negative_example_1', 'negative_example_2',
       'rule_violation'],
      dtype='object')

In [4]:
train.describe()

Unnamed: 0,row_id,rule_violation
count,2029.0,2029.0
mean,1014.0,0.508132
std,585.866168,0.500057
min,0.0,0.0
25%,507.0,0.0
50%,1014.0,1.0
75%,1521.0,1.0
max,2028.0,1.0


In [5]:
violated_reddit = train[train.rule_violation == 1]

In [6]:
violated_reddit.iloc[0].rule

'No legal advice: Do not offer or request legal advice.'

In [7]:
# do feature extraction and sentimental analysis for positive and negative example 1 and 2
# associate which keyword from column body and subreddit is related to the rule column (only two rules)

In [8]:
train.iloc[0]

row_id                                                                0
body                  Banks don't want you to know this! Click here ...
rule                  No Advertising: Spam, referral links, unsolici...
subreddit                                                    Futurology
positive_example_1    If you could tell your younger self something ...
positive_example_2    hunt for lady for jack off in neighbourhood ht...
negative_example_1    Watch Golden Globe Awards 2017 Live Online in ...
negative_example_2    DOUBLE CEE x BANDS EPPS - "BIRDS"\n\nDOWNLOAD/...
rule_violation                                                        0
Name: 0, dtype: object

In [9]:
train.iloc[0]['body']

"Banks don't want you to know this! Click here to know more!"

In [10]:
train.iloc[0]['positive_example_1']

'If you could tell your younger self something different about sex, what would that be?\n\ni AM IN A CONTEST TO WIN FUNDING FOR MY SEX POSITIVE FILM: VOTE HERE:\n\nhttp://sheknows.offerpop.com/campaign/813112/entry/v144417'

In [11]:
train.iloc[0]['positive_example_2']


'hunt for lady for jack off in neighbourhood http://url.inmusi.com/gakq'

In [12]:
train.iloc[0]['negative_example_1']

'Watch Golden Globe Awards 2017 Live Online in HD Coverage without ADS (VIP STREAMS)\n=\n\nHD STREAM QUALITY >>> [WATCH LINK1](http://forum.submitexpress.com/viewtopic.php?f=9&t=215858)\n=\n\nHD BROADCASTING QUALITY >>> [WATCH LINK1](http://forum.submitexpress.com/viewtopic.php?f=9&t=215858)\n=\n\nMobile Compatibility: YES\n=\n\nNO ADS | NO ADS | ADS\n=\n'

In [13]:
train.iloc[0]['negative_example_2']

'DOUBLE CEE x BANDS EPPS - "BIRDS"\n\nDOWNLOAD/STREAM:\n\nhttp://music.theblacksmithed.com/download/birds/'

In [14]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.sentiment import SentimentIntensityAnalyzer

## preprocess training datasets

In [15]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'[^a-z\s]', '', text) # keep only letters
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stopwords.words('english')]  # remove stop words
    return ' '.join(tokens)   

In [16]:
df = train.copy() # improvment: just left body column and label

In [17]:
df['body'] = df['body'].apply(preprocess)

## Sentiment analysis for the body column

In [18]:
sia = SentimentIntensityAnalyzer()

In [19]:
df['sentiment'] = df['body'].apply(lambda x: sia.polarity_scores(x)['compound'])

## Build small training set from examples

In [20]:
pos_eg = df[['positive_example_1', 'positive_example_2']].values.flatten()
neg_eg = df[['negative_example_1', 'negative_example_2']].values.flatten()

df_eg = pd.DataFrame({
    'text': list(pos_eg) + list(neg_eg),
    'label': [1] * len(pos_eg) + [0] * len(neg_eg)
})

df_eg['text'] = df_eg['text'].apply(preprocess)

# sentiment analysis on small datasets
df_eg['sentiment'] = df_eg['text'].apply(lambda x: sia.polarity_scores(x)['compound'])

drop_columns = [col for col in df.keys() if col.startswith(('pos', 'neg', 'row_id'))]

In [21]:
df_eg

Unnamed: 0,text,label,sentiment
0,could tell younger self something different se...,1,0.8126
1,hunt lady jack neighbourhood,1,0.0000
2,wan na kiss stunning,1,0.6597
3,lolgacom one first professional online gold si...,1,0.0000
4,dont break call cops willing get beat stay obv...,1,0.6057
...,...,...,...
8111,great thanks sharing,0,0.8689
8112,cares keep raping hood,0,-0.4215
8113,send private message may able help,0,0.4019
8114,site still works jump gps faq iphone mac,0,0.0000


In [22]:
df = df.drop(drop_columns, axis=1)

In [23]:
df

Unnamed: 0,body,rule,subreddit,rule_violation,sentiment
0,banks dont want know click know,"No Advertising: Spam, referral links, unsolici...",Futurology,0,-0.0572
1,sd stream eng link,"No Advertising: Spam, referral links, unsolici...",soccerstreams,0,0.0000
2,lol try appealing ban say wont,No legal advice: Do not offer or request legal...,pcmasterrace,1,-0.2023
3,come home open legs,"No Advertising: Spam, referral links, unsolici...",sex,1,0.0000
4,code free tyrande imgur friend codes dollars b...,"No Advertising: Spam, referral links, unsolici...",hearthstone,1,0.7579
...,...,...,...,...,...
2024,please edit post readable kids need cps need l...,No legal advice: Do not offer or request legal...,relationships,1,0.3182
2025,yes right work state even fire reason tell tho...,No legal advice: Do not offer or request legal...,legaladvice,0,0.3400
2026,hd streams eng hd watch herepc mobile ad overl...,"No Advertising: Spam, referral links, unsolici...",soccerstreams,1,0.4019
2027,obviously presents safety hazard universities ...,No legal advice: Do not offer or request legal...,politics,1,0.0772


In [24]:
df['rule'].unique()

array(['No Advertising: Spam, referral links, unsolicited advertising, and promotional content are not allowed.',
       'No legal advice: Do not offer or request legal advice.'],
      dtype=object)

In [25]:
df

Unnamed: 0,body,rule,subreddit,rule_violation,sentiment
0,banks dont want know click know,"No Advertising: Spam, referral links, unsolici...",Futurology,0,-0.0572
1,sd stream eng link,"No Advertising: Spam, referral links, unsolici...",soccerstreams,0,0.0000
2,lol try appealing ban say wont,No legal advice: Do not offer or request legal...,pcmasterrace,1,-0.2023
3,come home open legs,"No Advertising: Spam, referral links, unsolici...",sex,1,0.0000
4,code free tyrande imgur friend codes dollars b...,"No Advertising: Spam, referral links, unsolici...",hearthstone,1,0.7579
...,...,...,...,...,...
2024,please edit post readable kids need cps need l...,No legal advice: Do not offer or request legal...,relationships,1,0.3182
2025,yes right work state even fire reason tell tho...,No legal advice: Do not offer or request legal...,legaladvice,0,0.3400
2026,hd streams eng hd watch herepc mobile ad overl...,"No Advertising: Spam, referral links, unsolici...",soccerstreams,1,0.4019
2027,obviously presents safety hazard universities ...,No legal advice: Do not offer or request legal...,politics,1,0.0772


In [26]:
df['body'] = df['body'] + ' ' + df['subreddit']

df_eg['rule'] = None

df = df.drop('subreddit', axis=1)

In [27]:
df

Unnamed: 0,body,rule,rule_violation,sentiment
0,banks dont want know click know Futurology,"No Advertising: Spam, referral links, unsolici...",0,-0.0572
1,sd stream eng link soccerstreams,"No Advertising: Spam, referral links, unsolici...",0,0.0000
2,lol try appealing ban say wont pcmasterrace,No legal advice: Do not offer or request legal...,1,-0.2023
3,come home open legs sex,"No Advertising: Spam, referral links, unsolici...",1,0.0000
4,code free tyrande imgur friend codes dollars b...,"No Advertising: Spam, referral links, unsolici...",1,0.7579
...,...,...,...,...
2024,please edit post readable kids need cps need l...,No legal advice: Do not offer or request legal...,1,0.3182
2025,yes right work state even fire reason tell tho...,No legal advice: Do not offer or request legal...,0,0.3400
2026,hd streams eng hd watch herepc mobile ad overl...,"No Advertising: Spam, referral links, unsolici...",1,0.4019
2027,obviously presents safety hazard universities ...,No legal advice: Do not offer or request legal...,1,0.0772


In [28]:
df_eg = df_eg.rename(columns={'text':'body', 'label':'rule_violation'})

In [29]:
df_train = pd.concat([df, df_eg], ignore_index=True)
X = df_train.drop(columns = 'rule_violation')

In [30]:
X = pd.get_dummies(X, columns=['rule'])

In [31]:
y = df_train['rule_violation']

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [33]:
tfdif = TfidfVectorizer()

In [34]:
loreg = LogisticRegression()

In [35]:
X_transform = tfdif.fit_transform(X['body'])

In [36]:
tfdif.get_feature_names_out()

array(['2007scape', 'aaron', 'ab', ..., 'zones', 'zurich', 'zven'],
      dtype=object)

In [37]:
X_transform.shape

(10145, 7026)

In [38]:
y.shape

(10145,)

In [39]:
loreg.fit(X_transform, y)

In [40]:
y_pred = loreg.predict(X_transform)

In [41]:
print(classification_report(y, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.97      0.97      5056
           1       0.97      0.98      0.97      5089

    accuracy                           0.97     10145
   macro avg       0.97      0.97      0.97     10145
weighted avg       0.97      0.97      0.97     10145



In [42]:
# apply train test split
X_train, X_test, y_train, y_test = train_test_split(X_transform, y, test_size=0.3)

In [43]:
loreg.fit(X_train, y_train)

In [44]:
y_pred = loreg.predict(X_test)

In [45]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96      1453
           1       0.96      0.96      0.96      1591

    accuracy                           0.96      3044
   macro avg       0.96      0.96      0.96      3044
weighted avg       0.96      0.96      0.96      3044



In [46]:
confusion_matrix(y_test, y_pred)

array([[1395,   58],
       [  64, 1527]])

In [47]:
# prepare test dataset

def preprocess(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)  # remove URLs
    text = re.sub(r'[^a-z\s]', '', text) # keep only letters
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stopwords.words('english')]  # remove stop words
    return ' '.join(tokens)   

# split dataset into two
def build_dataset(df):

    columns_name = [col for col in df.keys() if col.startswith(('pos', 'neg'))]

    # first dataset
    df = df.drop('row_id', axis=1)
    df = df.drop(columns_name, axis=1)
    df['body'] = df['body'].apply(preprocess)
    sia = SentimentIntensityAnalyzer()
    df['sentiment'] = df['body'].apply(lambda x: sia.polarity_scores(x)['compound'])
    df['body'] = df['body'] + ' ' + df['subreddit']
    df = df.drop('subreddit', axis=1)
    df = pd.get_dummies(df, columns=['rule'])
    return df

    # # second dataset
    # pos_eg = df[['positive_example_1', 'positive_example_2']].values.flatten()
    # neg_eg = df[['negative_example_1', 'negative_example_2']].values.flatten()
    # df_eg = pd.DataFrame({
    #     'body': list(pos_eg) + list(neg_eg),
    #     'rule_violation': [1] * len(pos_eg) + [0] * len(neg_eg)
    # })
    # df_eg['body'] = df_eg['body'].apply(preprocess)
    # df_eg['sentiment'] = df_eg['body'].apply(lambda x: sia.polarity_scores(x)['compound'])
    # df_eg['rule'] = None

    # # merge dataset
    # return pd.concat([df1, df_eg], ignore_index = True)


In [48]:
X_submit = build_dataset(test)

In [49]:
X_submit

Unnamed: 0,body,sentiment,"rule_No Advertising: Spam, referral links, unsolicited advertising, and promotional content are not allowed.",rule_No legal advice: Do not offer or request legal advice.
0,new rap group check us hiphopheads,0.0,True,False
1,make life comfortable get discount pain killer...,-0.8481,False,True
2,kickin ass selling underwear made account last...,-0.5423,True,False
3,watch hooters best therein personalfinance,0.6369,True,False
4,bitches free point show Showerthoughts,-0.1531,True,False
5,top pentakills august please subscribe every d...,0.4767,True,False
6,win samsung smartwatch enter email adress foll...,0.9062,True,False
7,mixtape lit fam BlackPeopleTwitter,0.0,True,False
8,must watch movie movies,0.0,True,False
9,free paypal cards pics,0.5106,True,False


In [50]:
X_submit_transform = tfdif.transform(X_submit['body'])

In [51]:
y_pred = loreg.predict(X_submit_transform)

In [52]:
y_submit = pd.DataFrame({
    'row_id': test['row_id'],
    'rule_violation': y_pred
})

In [53]:
y_submit

Unnamed: 0,row_id,rule_violation
0,2029,0
1,2030,0
2,2031,1
3,2032,1
4,2033,1
5,2034,0
6,2035,1
7,2036,0
8,2037,0
9,2038,1


In [54]:
y_submit.to_csv('/kaggle/working/submission.csv')