In [1]:
import pandas as pd
from sklearn.datasets import fetch_20newsgroups




In [2]:
# Fetch the predefined train and test splits of the 20 Newsgroups corpus.
train = fetch_20newsgroups(subset='train')
test = fetch_20newsgroups(subset="test")

# Raw documents (X_*) and their integer class labels (y_*).
X_train, y_train = train["data"], train["target"]
X_test, y_test = test['data'], test['target']

# One row per document: raw text, label, and character count of the raw text.
train_df = pd.DataFrame({'text': X_train, 'target': y_train}).assign(
    length=lambda frame: frame['text'].map(len)
)

test_df = pd.DataFrame({'text': X_test, 'target': y_test}).assign(
    length=lambda frame: frame['text'].map(len)
)


In [18]:
# Display the training frame (columns: text, target, length); the rendered
# output below elides the middle rows.
train_df

Unnamed: 0,text,target,length
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,721
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,858
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,1981
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,815
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,1120
...,...,...,...
11309,From: jim.zisfein@factory.com (Jim Zisfein) \n...,13,2159
11310,From: ebodin@pearl.tufts.edu\nSubject: Screen ...,4,823
11311,From: westes@netcom.com (Will Estes)\nSubject:...,3,781
11312,From: steve@hcrlgw (Steven Collins)\nSubject: ...,1,1287


In [8]:
# Preview the first rows of the raw test frame.
test_df.head()

Unnamed: 0,text,target,length
0,From: v064mb9k@ubvmsd.cc.buffalo.edu (NEIL B. ...,7,695
1,From: Rick Miller <rick@ee.uwm.edu>\nSubject: ...,5,939
2,From: mathew <mathew@mantis.co.uk>\nSubject: R...,0,453
3,From: bakken@cs.arizona.edu (Dave Bakken)\nSub...,17,5239
4,From: livesey@solntze.wpd.sgi.com (Jon Livesey...,19,1007


In [19]:
import string
import re
from sklearn.base import TransformerMixin
import spacy
import en_core_web_lg
# Load the en_core_web_lg spaCy pipeline once at module level.
# TextPreprocessor._lemmatize reads this global `nlp` for every document.
nlp = en_core_web_lg.load()

class TextPreprocessor(TransformerMixin):
    """Transformer that normalises one text column of a DataFrame.

    Every document in the column goes through three stages:

    1. ``_clean``: drop a fixed set of symbols, lines starting with
       ``from:`` or ending with ``writes:`` (case-insensitive), e-mail
       addresses and the literal markers ``Subject:`` / ``Re:``.
    2. ``_leave_letters_only``: strip punctuation and keep only runs of
       ASCII letters, joined by single spaces.
    3. ``_lemmatize``: run the module-level spaCy pipeline ``nlp`` and keep
       the lemmas of tokens that are not stop words and whose part of speech
       is not PUNCT, PART or X.

    Parameters
    ----------
    text_attribute : str
        Name of the column holding the text to preprocess.
    """

    # Translation tables and compiled patterns are loop-invariant, so they are
    # built once at class creation instead of once per document.
    _BAD_SYMBOLS_TABLE = str.maketrans('', '', '!"#%&\'*+,-<=>?[\\]^_`{|}~')
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
    # Applied in this order: e-mail addresses first, then the header markers.
    _PATTERNS_TO_REMOVE = (
        re.compile(r'([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)'),
        re.compile(r'Subject:'),
        re.compile(r'Re:'),
    )
    _LETTER_RUN = re.compile("[a-zA-Z]+")
    _SKIPPED_POS = frozenset(('PUNCT', 'PART', 'X'))

    def __init__(self, text_attribute):
        self.text_attribute = text_attribute

    def fit(self, *_):
        """Do nothing and return ``self``; any arguments are ignored."""
        return self

    def transform(self, X, *_):
        """Return a copy of ``X`` whose text column has been preprocessed.

        ``X`` itself is not modified; all other columns are carried over
        unchanged. Extra positional arguments are ignored.
        """
        X_copy = X.copy()
        X_copy[self.text_attribute] = X_copy[self.text_attribute].apply(self._preprocess_text)
        return X_copy

    def _preprocess_text(self, text):
        """Run the full clean -> letters-only -> lemmatise chain on one string."""
        cleaned = self._clean(text)
        letters_only = self._leave_letters_only(cleaned)
        return self._lemmatize(letters_only)

    def _clean(self, text):
        """Strip unwanted symbols, attribution lines, e-mails and header markers."""
        without_symbols = text.translate(self._BAD_SYMBOLS_TABLE)

        kept_lines = []
        for line in without_symbols.split('\n'):
            lowered = line.lower()
            # Drop sender lines ("From: ...") and quote attributions ("... writes:").
            if lowered.startswith('from:') or lowered.endswith('writes:'):
                continue
            kept_lines.append(line)

        # Every kept line is newline-terminated, including the last one.
        clean_text = ''.join(line + '\n' for line in kept_lines)
        for pattern in self._PATTERNS_TO_REMOVE:
            clean_text = pattern.sub('', clean_text)

        return clean_text

    def _leave_letters_only(self, text):
        """Remove punctuation, then join the remaining ASCII letter runs with spaces."""
        # Punctuation is deleted (not replaced by a space) before tokenising,
        # so e.g. "e.g." collapses into the single token "eg".
        without_punctuation = text.translate(self._PUNCTUATION_TABLE)
        return ' '.join(self._LETTER_RUN.findall(without_punctuation))

    def _lemmatize(self, text):
        """Return the space-joined lemmas of the tokens worth keeping.

        Uses the module-level spaCy pipeline ``nlp``.
        """
        doc = nlp(text)
        lemmas = [
            token.lemma_
            for token in doc
            if not token.is_stop and token.pos_ not in self._SKIPPED_POS
        ]
        return ' '.join(lemmas)

In [20]:
# Build the preprocessor for the 'text' column and apply it to the training
# frame. transform() works on a copy, so train_df itself stays unchanged.
text_preprocessor = TextPreprocessor(text_attribute='text')
train_df_preprocessed = text_preprocessor.transform(train_df)

In [21]:
# Preview the preprocessed training frame. Only `text` is rewritten by
# transform(), so `length` still holds the character count of the raw text.
train_df_preprocessed.head()

Unnamed: 0,text,target,length
0,car NntpPostingHost rac Organization Universit...,7,721
1,SI Clock Poll Final Summary final SI clock rep...,4,858
2,PB question Organization Purdue University Eng...,4,1981
3,Weitek P Organization Harris Computer Systems ...,1,815
4,Shuttle Launch Question Organization Smithsoni...,14,1120


In [29]:
# Snapshot the preprocessed training frame and persist it as a UTF-8 CSV
# (no index column, "\n" line endings on every platform).
# NOTE(review): the destination is an absolute, machine-specific path; change
# it before running this notebook on another machine.
datatraincsv = train_df_preprocessed.copy()
datatraincsv.to_csv(
    path_or_buf="C:/Users/flori/Dropbox/Desktop/bachelorarbeit/realshit/Datasets/20NG/processed/training.csv",
    columns=["text", "target", "length"],
    header=True,
    index=False,
    encoding="utf-8",
    # pandas 1.5 renamed `line_terminator` to `lineterminator`; the old
    # spelling was removed in pandas 2.0 and raises TypeError there.
    lineterminator="\n",
)

In [30]:
# Apply the same preprocessor to the test frame (test_df is not modified,
# because transform() works on a copy) and preview the first rows.
# `length` still holds the character count of the raw text.
test_df_preprocessed = text_preprocessor.transform(test_df)
test_df_preprocessed.head()

Unnamed: 0,text,target,length
0,need info Bonneville Organization University B...,7,695
1,XFace Organization Lines Distribution world NN...,5,939
2,strong weak Atheism Organization Mantis Consul...,0,453
3,saudi clergy condemn debut human right group K...,17,5239
4,year Christian Morality Organization sgi Lines...,19,1007


In [31]:
# Snapshot the preprocessed test frame and persist it as a UTF-8 CSV
# (no index column, "\n" line endings on every platform).
# NOTE(review): the destination is an absolute, machine-specific path; change
# it before running this notebook on another machine.
datatestcsv = test_df_preprocessed.copy()
datatestcsv.to_csv(
    path_or_buf="C:/Users/flori/Dropbox/Desktop/bachelorarbeit/realshit/Datasets/20NG/processed/test.csv",
    columns=["text", "target", "length"],
    header=True,
    index=False,
    encoding="utf-8",
    # pandas 1.5 renamed `line_terminator` to `lineterminator`; the old
    # spelling was removed in pandas 2.0 and raises TypeError there.
    lineterminator="\n",
)