In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
data = fetch_20newsgroups()

In [3]:
data.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

In [4]:
categories = ['soc.religion.christian', 'misc.forsale', 'sci.electronics', 'rec.motorcycles']

In [5]:
train = fetch_20newsgroups(categories=categories, random_state=42, subset='train')

In [6]:
test = fetch_20newsgroups(categories=categories, random_state=42, subset='test')

In [11]:
test_data = train.data[1]

In [12]:
test_data.split('\n')

['From: gerald.belton@ozonehole.com (Gerald Belton) ',
 'Subject: Need to find out numb',
 'Distribution: world',
 'Organization: Ozone Online Operations, Inc. - New Orleans, LA',
 'Reply-To: gerald.belton@ozonehole.com (Gerald Belton) ',
 'Lines: 24',
 '',
 'AL>>        Question:   Is there a certain device out there that I can',
 'AL>>                    use to find out the number to the line?',
 'AL>>        Thanks for any response.',
 'AL>>                                                    Al',
 '',
 'AL>There is a number you can call which will return a synthesized',
 'AL>voice telling you the number of the line.  Unfortunately, for the',
 "AL>life of me I can't remember what it is. The telephone technicians",
 'AL>use it all the time.  We used to play around with this in our',
 'AL>dorm rooms since there were multiple phone lines running between',
 'AL>rooms.',
 '',
 "It probably wouldn't help for you to post the number, since it appears",
 "to be different in each area.  For wh

In [13]:
from sklearn.datasets.twenty_newsgroups import strip_newsgroup_footer, strip_newsgroup_header

In [25]:
d = strip_newsgroup_footer(train.data[0])

In [28]:
d

'ITEM: Klipsch Forte 2 Speakers\nCONDITION: Mint\nAGE: 6 months old\n\nPRICE: $1000/pair (retail: $1400/pair)'

In [27]:
d = strip_newsgroup_header(d)

In [20]:
d.replace('\n','')

"AL>>        Question:   Is there a certain device out there that I canAL>>                    use to find out the number to the line?AL>>        Thanks for any response.AL>>                                                    AlAL>There is a number you can call which will return a synthesizedAL>voice telling you the number of the line.  Unfortunately, for theAL>life of me I can't remember what it is. The telephone techniciansAL>use it all the time.  We used to play around with this in ourAL>dorm rooms since there were multiple phone lines running betweenAL>rooms.It probably wouldn't help for you to post the number, since it appearsto be different in each area.  For what it's worth, in the New Orleansarea the number is 998-877-6655 (easy to remember, what?) * SLMR 2.1 * Ask me anything: if I don't know, I'll make up something.                                          "

In [90]:
from sklearn.base import BaseEstimator, TransformerMixin

class BodyExtractor(BaseEstimator, TransformerMixin):
    def fit(self,x,y=None):
        return self
    
#     def transform(self,posts):
#         bodies = []
#         for post in posts:
            
#             st = post[post.find('Subject')+len('Subject: '):]
#             end = st.find('\n')
#             subject = st[:end]
            
#             body = strip_newsgroup_footer(post)
#             body = strip_newsgroup_header(body)
#             body = body.replace('\n','')
            
#             bodies.append({'subject':subject, 'body':body})
#         return bodies

    def transform(self,post):

        st = post[post.find('Subject')+len('Subject: '):]
        end = st.find('\n')
        subject = st[:end]
            
        body = strip_newsgroup_footer(post)
        body = strip_newsgroup_header(body)
        body = body.replace('\n','')
            
        return {'subject':subject, 'body':body}
        
        

In [91]:
be = BodyExtractor()

In [68]:
for e in be.transform(train.data[1:4]):
    print (e,'\n')

{'subject': 'Need to find out numb', 'body': "AL>>        Question:   Is there a certain device out there that I canAL>>                    use to find out the number to the line?AL>>        Thanks for any response.AL>>                                                    AlAL>There is a number you can call which will return a synthesizedAL>voice telling you the number of the line.  Unfortunately, for theAL>life of me I can't remember what it is. The telephone techniciansAL>use it all the time.  We used to play around with this in ourAL>dorm rooms since there were multiple phone lines running betweenAL>rooms.It probably wouldn't help for you to post the number, since it appearsto be different in each area.  For what it's worth, in the New Orleansarea the number is 998-877-6655 (easy to remember, what?) * SLMR 2.1 * Ask me anything: if I don't know, I'll make up something.                                          "} 

{'subject': 'Re: What is Zero dB????', 'body': ">>: The Ohmite company 

In [58]:
d = train.data[0]

In [49]:
d.find('Subject')

45

In [59]:
st = d[d.find('Subject')+len('Subject: '):]

In [62]:
st.find('\n')

28

In [56]:
st[:28]

'Klipsch Forte 2 SPKRS 4 Sale'

In [69]:
data_tf = be.transform(train.data)

In [70]:
import pandas as pd

In [71]:
df = pd.DataFrame(data_tf)

In [72]:
df.head()

Unnamed: 0,body,subject
0,ITEM: Klipsch Forte 2 SpeakersCONDITION: MintA...,Klipsch Forte 2 SPKRS 4 Sale
1,AL>> Question: Is there a certain dev...,Need to find out numb
2,>>: The Ohmite company was the first to charac...,Re: What is Zero dB????
3,Recently I posted two oscilliscopes for sale. ...,Oscilliscopes for sale
4,,Forsale: Sony D-22 diskman


In [87]:
class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
        
    def fit(self,x,y=None):
        return self
    
    def transform(self, data_dict):
        print (type(data_dict))
        return data_dict[self.key]

In [78]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC

In [92]:
pipeline = Pipeline([
    ('subjectbody', BodyExtractor()),
    
    ('union', FeatureUnion(
        transformer_list=[
            ('subject', Pipeline([
                ('selector',ItemSelector(key='subject')),
                ('tfidf',TfidfVectorizer(min_df=50)),
            ])),
            
            ('body', Pipeline([
                ('selector', ItemSelector(key='body')),
                ('tfidf', TfidfVectorizer()),
            ])),
        ])),
    ('svc', SVC(kernel='linear'))
    
])

In [93]:
pipeline.fit(train.data, train.target)

AttributeError: 'list' object has no attribute 'find'

In [94]:
import sklearn-pandas

SyntaxError: invalid syntax (<ipython-input-94-4be7856840ae>, line 1)