Crash with libCurl 7.57.0 and CURL_LOCK_DATA_CONNECT #2132

Closed
patrickdawson opened this Issue Dec 1, 2017 · 23 comments

Comments

Projects
None yet
3 participants
@patrickdawson

patrickdawson commented Dec 1, 2017

I did this

I tried the new option CURLSHOPT_SHARE with parameter CURL_LOCK_DATA_CONNECT available in libcurl 7.57.0 according to https://curl.haxx.se/dev/release-notes.html. If I use this option I get random crashes in my application with different call stacks.

Here is an example program to reproduce the problem:

#include <windows.h>
#include <iostream>
#include <tchar.h>
#include <strsafe.h>
#include <curl/curl.h>

using namespace std;

// Number of worker threads. Must not exceed MAXIMUM_WAIT_OBJECTS (64), or
// WaitForMultipleObjects below will not actually wait for all of them.
const size_t MAX_THREADS = 3;
// Number of HTTP GET requests each worker thread performs.
const size_t ITERATIONS = 100;

// Throw on any libcurl failure. Wrapped in do { } while (0) so the macro
// expands to exactly one statement and composes safely with if/else chains
// at the call site (the original bare `if` was a dangling-else hazard).
#define CHECK_CURL(cmd) \
    do \
    { \
        if ((cmd) != CURLE_OK) \
        { \
            throw runtime_error( "Error executing: "#cmd ); \
        } \
    } while (0)

DWORD WINAPI MyThreadFunction( LPVOID lpParam );

// Global mutex used by the curl share lock/unlock callbacks to serialize
// access to the shared CURLSH data.
HANDLE ghMutex;

static void ShareLockFunc( CURL* pHandle, curl_lock_data Data, curl_lock_access Access, void* pUseptr )
{
    (void)pHandle;
    (void)Data;
    (void)Access;
    (void)pUseptr;
    WaitForSingleObject(
        ghMutex,    // handle to mutex
        INFINITE );  // no time-out interval
}

static void ShareUnlockFunc( CURL* pHandle, curl_lock_data Data, void* pUseptr )
{
    (void)pHandle;
    (void)Data;
    (void)pUseptr;
    ReleaseMutex( ghMutex );
}

// Per-thread payload handed to MyThreadFunction: carries the CURLSH share
// handle so every worker attaches its easy handles to the same shared state.
typedef struct MyData {
    CURLSH* pShare;
} MYDATA, *PMYDATA;

// Perform a blocking HTTP GET against the local test server with the given
// easy handle and return the HTTP response code. Throws runtime_error (via
// CHECK_CURL) on any libcurl failure.
int CurlPerformGet( CURL* pHandle )
{
    CHECK_CURL( curl_easy_setopt( pHandle, CURLOPT_FOLLOWLOCATION, 1L ) );
    CHECK_CURL( curl_easy_setopt( pHandle, CURLOPT_URL, "http://127.0.0.1:3000" ) );
    CHECK_CURL( curl_easy_perform( pHandle ) );
    // CURLINFO_RESPONSE_CODE writes a `long` per the curl documentation;
    // passing an int* is undefined behavior where long is wider than int.
    long StatusCode = 0;
    CHECK_CURL( curl_easy_getinfo( pHandle, CURLINFO_RESPONSE_CODE, &StatusCode ) );
    return (int)StatusCode;
}

int _tmain()
{
    PMYDATA pDataArray[MAX_THREADS];
    DWORD   dwThreadIdArray[MAX_THREADS];
    HANDLE  hThreadArray[MAX_THREADS];

    ghMutex = CreateMutex(
        NULL,              // default security attributes
        FALSE,             // initially not owned
        NULL );            // unnamed mutex

    if (ghMutex == NULL)
    {
        printf( "CreateMutex error: %d\n", GetLastError() );
        return 1;
    }

    (void)curl_global_init( CURL_GLOBAL_ALL );

    CURLSH *pShare = curl_share_init();
    curl_share_setopt( pShare, CURLSHOPT_SHARE, CURL_LOCK_DATA_CONNECT );
    curl_share_setopt( pShare, CURLSHOPT_LOCKFUNC, ShareLockFunc );
    curl_share_setopt( pShare, CURLSHOPT_UNLOCKFUNC, ShareUnlockFunc );

    // Create MAX_THREADS worker threads.
    for (int i = 0; i < MAX_THREADS; i++)
    {
        pDataArray[i] = (PMYDATA)HeapAlloc( GetProcessHeap(), HEAP_ZERO_MEMORY,
            sizeof( MYDATA ) );

        pDataArray[i]->pShare = pShare;

        if (pDataArray[i] == NULL)
        {
            ExitProcess( 2 );
        }

        hThreadArray[i] = CreateThread(
            NULL,                   // default security attributes
            0,                      // use default stack size
            MyThreadFunction,       // thread function name
            pDataArray[i],
            0,                      // use default creation flags
            &dwThreadIdArray[i] );   // returns the thread identifier

        if (hThreadArray[i] == NULL)
        {
            cout << "Create thread failed" << endl;
            ExitProcess( 3 );
        }
    }


    WaitForMultipleObjects( MAX_THREADS, hThreadArray, TRUE, INFINITE );

    for (int i = 0; i < MAX_THREADS; i++)
    {
        CloseHandle( hThreadArray[i] );
    }

    curl_share_cleanup( pShare );
    curl_global_cleanup();

    CloseHandle( ghMutex );

    return 0;
}


// Worker thread body: runs ITERATIONS sequential GET requests, each on a
// fresh easy handle attached to the shared CURLSH, printing the HTTP status.
DWORD WINAPI MyThreadFunction( LPVOID lpParam )
{
    CURLSH* pSharedState = ((PMYDATA)lpParam)->pShare;

    for (size_t Iteration = 0; Iteration < ITERATIONS; ++Iteration)
    {
        CURL* pEasy = curl_easy_init();
        curl_easy_setopt( pEasy, CURLOPT_SHARE, pSharedState );
        cout << "Status: " << CurlPerformGet( pEasy ) << endl;
        curl_easy_cleanup( pEasy );
    }

    return 0;
}

As test server I used a small nodejs express server:

const express = require('express');

const app = express();

// Requests served since the last report tick.
let requestsCount = 0;

// Keep the process alive if a handler throws outside Express's own catch.
process.on('uncaughtException', function (err) {
    console.log("uncaught exception");
    console.log(err);
});

// Root route: log headers, count the request, and reply after a 1 ms delay.
app.get('/', function (req, res) {
    console.log(req.headers);
    requestsCount += 1;
    setTimeout(function () {
        res.send('Hello World!');
    }, 1);
});

// Express error-handling middleware (four-argument signature).
app.use(function (err, req, res, next) {
    console.error("Express error catched!", err);
    res.status(500).send("Error happened");
});


const server = app.listen(3000, function () {
    console.log('Example app listening on port 3000!');
});

// Log each inbound TCP connection and enable keep-alive on its socket.
let connectionIndex = 0;
server.on("connection", function (socket) {
    console.log(`GOT CONNECTION ${connectionIndex++} on addr: ${JSON.stringify(socket.address())}`);
    socket.setKeepAlive(true);
});

// Report the request rate every `interval` milliseconds.
const interval = 2000;
setInterval(function () {
    console.log(`got ${requestsCount} requests in ${interval/1000}s`);
    requestsCount = 0;
}, interval);

Here are the callstacks from the crashes I got:

Callstack 1 (this one happens most of the time) with the following error:
free(ca->ai_addr); // Read access violation, ca is 0x1

libcurl.dll!Curl_freeaddrinfo(Curl_addrinfo * cahead) Line 84      C
libcurl.dll!freednsentry(void * freethis) Line 744               C
libcurl.dll!hash_element_dtor(void * user, void * element) Line 41          C
libcurl.dll!Curl_llist_remove(curl_llist * list, curl_llist_element * e, void * user) Line 130 C
libcurl.dll!Curl_llist_destroy(curl_llist * list, void * user) Line 138               C
libcurl.dll!Curl_hash_destroy(curl_hash * h) Line 213      C
libcurl.dll!curl_multi_cleanup(Curl_multi * multi) Line 2237         C
libcurl.dll!Curl_close(Curl_easy * data) Line 327                C
HttpClientDirectTest2.exe!MyThreadFunction(void * lpParam) Line 134 C++

Callstack 2 (less frequently than callstack 1 in my tests) with the following error:
diff = Curl_timediff(node->time, now); // node is 0

libcurl.dll!add_next_timeout(curltime now, Curl_multi * multi, Curl_easy * d) Line 2483               C
libcurl.dll!curl_multi_perform(Curl_multi * multi, int * running_handles) Line 2187         C
libcurl.dll!easy_transfer(Curl_multi * multi) Line 683      C
libcurl.dll!easy_perform(Curl_easy * data, bool events) Line 769               C
libcurl.dll!curl_easy_perform(Curl_easy * data) Line 788               C
HttpClientDirectTest2.exe!CurlPerformGet(void * pHandle) Line 49         C++
HttpClientDirectTest2.exe!MyThreadFunction(void * lpParam) Line 133 C++

I expected the following

I expected that the application does not crash and that the sockets are reused so that the operating system does not report 300 WAITING sockets after the application has finished.

curl/libcurl version

7.57.0

operating system

Windows 7 Enterprise Service Pack 1

@jay jay added the crash label Dec 1, 2017

@bagder

This comment has been minimized.

Show comment Hide comment
@bagder

bagder Dec 1, 2017

Member

I wrote up a pthreads-using alternative version, which runs 37 threads, each thread doing 11 loops and they're all downloading a 512MB file from http://localhost/...

I didn't manage to reproduce the crash but I've stared at the stack traces above and read code and I think I know what's going on. The problem is in curl_multi_cleanup that kills connections without locking.

Member

bagder commented Dec 1, 2017

I wrote up a pthreads-using alternative version, which runs 37 threads, each thread doing 11 loops and they're all downloading a 512MB file from http://localhost/...

I didn't manage to reproduce the crash but I've stared at the stack traces above and read code and I think I know what's going on. The problem is in curl_multi_cleanup that kills connections without locking.

@bagder

This comment has been minimized.

Show comment Hide comment
@bagder

bagder Dec 1, 2017

Member

No, I was wrong. The connection cache that is initialized and kept in the multi handle is not used when the shared connection is used so that's not it...

Member

bagder commented Dec 1, 2017

No, I was wrong. The connection cache that is initialized and kept in the multi handle is not used when the shared connection is used so that's not it...

@bagder

This comment has been minimized.

Show comment Hide comment
@bagder

bagder Dec 1, 2017

Member

The first stack trace points at the "host cache" that seems to cause the issue: Curl_hash_destroy(&multi->hostcache); in multi.c. But the host cache is not the connection cache... That rather looks like some memory corruption or similar.

Member

bagder commented Dec 1, 2017

The first stack trace points at the "host cache" that seems to cause the issue: Curl_hash_destroy(&multi->hostcache); in multi.c. But the host cache is not the connection cache... That rather looks like some memory corruption or similar.

@patrickdawson

This comment has been minimized.

Show comment Hide comment
@patrickdawson

patrickdawson Dec 1, 2017

I tested it just yet at home with macOS. There is no crash either. Seems to happen on windows only.

I tested it just yet at home with macOS. There is no crash either. Seems to happen on windows only.

@patrickdawson

This comment has been minimized.

Show comment Hide comment
@patrickdawson

patrickdawson Dec 2, 2017

This is the version of the testprogram I ran on macos:
https://gist.github.com/patrickdawson/da3ff09835dfc0c19caf2fd94c105f18

I tried this version on windows 10 with Visual Studio 2015 as well: on windows this version of the program crashes nearly every time.

This is the version of the testprogram I ran on macos:
https://gist.github.com/patrickdawson/da3ff09835dfc0c19caf2fd94c105f18

I tried this version on windows 10 with Visual Studio 2015 as well: on windows this version of the program crashes nearly every time.

@bagder

This comment has been minimized.

Show comment Hide comment
@bagder

bagder Dec 2, 2017

Member

Interesting. Some questions:

  1. If you remove the sharing of the connection cache I assume it works fine?
  2. What if you remove the redirect-following does that change behavior? Maybe I should add that to my test case and see if that changes anything.
  3. This is 10 threads doing 100 iterations each. Do you figure that's necessary to trigger the problem or does it also repeat if you just use two threads?
Member

bagder commented Dec 2, 2017

Interesting. Some questions:

  1. If you remove the sharing of the connection cache I assume it works fine?
  2. What if you remove the redirect-following does that change behavior? Maybe I should add that to my test case and see if that changes anything.
  3. This is 10 threads doing 100 iterations each. Do you figure that's necessary to trigger the problem or does it also repeat if you just use two threads?
@patrickdawson

This comment has been minimized.

Show comment Hide comment
@patrickdawson

patrickdawson Dec 2, 2017

  1. Yes. If I remove the connection cache everything works fine (except the socket explosion ;) )
  2. Removing redirect-following has no effect. It crashes without that setting as well.
  3. No it is not necessary. I can still reproduce crashes with 2 threads but they happen less frequently then.
  1. Yes. If I remove the connection cache everything works fine (except the socket explosion ;) )
  2. Removing redirect-following has no effect. It crashes without that setting as well.
  3. No it is not necessary. I can still reproduce crashes with 2 threads but they happen less frequently then.

@bagder bagder changed the title from Problem/Crash with libCurl 7.57.0 and Option CURLSHOPT_SHARE with Parameter CURL_LOCK_DATA_CONNECT to Crash with libCurl 7.57.0 and CURL_LOCK_DATA_CONNECT Dec 2, 2017

@bagder

This comment has been minimized.

Show comment Hide comment
@bagder

bagder Dec 2, 2017

Member

I have an idea. Will offer a test patch soon.

Member

bagder commented Dec 2, 2017

I have an idea. Will offer a test patch soon.

bagder added a commit that referenced this issue Dec 2, 2017

conncache: hold the lock longer when fiddling with the bundles
If the lock is released before the dealings with the bundle is over, it may
have changed by another thread in the mean time.

Fixes #2132
@bagder

This comment has been minimized.

Show comment Hide comment
@bagder

bagder Dec 2, 2017

Member

It'd be great if you could try out a build with the #2139 change applied!

Member

bagder commented Dec 2, 2017

It'd be great if you could try out a build with the #2139 change applied!

@patrickdawson

This comment has been minimized.

Show comment Hide comment
@patrickdawson

patrickdawson Dec 2, 2017

Thanks for the fast patch. I have some interesting new fact: the patch you provided seems to make the example crash free as long as I use a maximum of 5 threads. After the example is finished I have 5 sockets in windows that are in state WAITING according to netstat.
As soon as I use more threads the crashes start to happen again. What I also see is that the WAITING sockets get out of control (several hundred waiting sockets) when the app crashes.
These 5 sockets seem to perfectly match the default connection pool cache size.

Thanks for the fast patch. I have some interesting new fact: the patch you provided seems to make the example crash free as long as I use a maximum of 5 threads. After the example is finished I have 5 sockets in windows that are in state WAITING according to netstat.
As soon as I use more threads the crashes start to happen again. What I also see is that the WAITING sockets get out of control (several hundred waiting sockets) when the app crashes.
These 5 sockets seem to perfectly match the default connection pool cache size.

@bagder

This comment has been minimized.

Show comment Hide comment
@bagder

bagder Dec 2, 2017

Member

I found another suspicious detail that I pushed to the PR branch.

When scrutinizing this functionality closer, it also struck me that it will behave badly for the cases where pipelining or h2 is attempted on a connection that is owned by another thread. We need to make sure that those features are limited to easy handles that are part of the same multi handle. But this crash isn't using either of those functions...

Member

bagder commented Dec 2, 2017

I found another suspicious detail that I pushed to the PR branch.

When scrutinizing this functionality closer, it also struck me that it will behave badly for the cases where pipelining or h2 is attempted on a connection that is owned by another thread. We need to make sure that those features are limited to easy handles that are part of the same multi handle. But this crash isn't using either of those functions...

@bagder

This comment has been minimized.

Show comment Hide comment
@bagder

bagder Dec 4, 2017

Member

Please try again with my additional teeny weeny fix added. It is a bit tricky for me since I can't reproduce this easily - which given the flaws I found I honestly can't really understand why I can't more easily... :-/

Member

bagder commented Dec 4, 2017

Please try again with my additional teeny weeny fix added. It is a bit tricky for me since I can't reproduce this easily - which given the flaws I found I honestly can't really understand why I can't more easily... :-/

@patrickdawson

This comment has been minimized.

Show comment Hide comment
@patrickdawson

patrickdawson Dec 4, 2017

Yes I would but I don't see any commit on that branch. Which commit (fix) do you mean?

Yes I would but I don't see any commit on that branch. Which commit (fix) do you mean?

@bagder

This comment has been minimized.

Show comment Hide comment
@bagder

bagder Dec 4, 2017

Member

There are three commits in that branch now as #2139 shows. The followup is 3dee4fc and then I fixed the test case in the 3rd commit but that's mostly to make the build green again.

Member

bagder commented Dec 4, 2017

There are three commits in that branch now as #2139 shows. The followup is 3dee4fc and then I fixed the test case in the 3rd commit but that's mostly to make the build green again.

@patrickdawson

This comment has been minimized.

Show comment Hide comment
@patrickdawson

patrickdawson Dec 4, 2017

Still getting crashes but less frequently.

Callstack is:

libcurl.dll!conn_free(connectdata * conn) Line 726	C
libcurl.dll!Curl_disconnect(connectdata * conn, bool dead_connection) Line 795	C
libcurl.dll!multi_done(connectdata * * connp, CURLcode status, bool premature) Line 616	C
libcurl.dll!multi_runsingle(Curl_multi * multi, curltime now, Curl_easy * data) Line 1710	C
libcurl.dll!curl_multi_perform(Curl_multi * multi, int * running_handles) Line 2168	C
libcurl.dll!easy_transfer(Curl_multi * multi) Line 694	C
libcurl.dll!easy_perform(Curl_easy * data, bool events) Line 780	C
libcurl.dll!curl_easy_perform(Curl_easy * data) Line 799	C
HttpClientDirectTest2.exe!CurlPerformGet(void * pHandle) Line 193	C++
HttpClientDirectTest2.exe!MyThreadFunc(void * pShare) Line 205	C++

And I still see a lot of WAITING sockets per run. If the connection pool cache size is set to its default value 5 shouldn't there be a maximum of 5 waiting sockets after each run?

patrickdawson commented Dec 4, 2017

Still getting crashes but less frequently.

Callstack is:

libcurl.dll!conn_free(connectdata * conn) Line 726	C
libcurl.dll!Curl_disconnect(connectdata * conn, bool dead_connection) Line 795	C
libcurl.dll!multi_done(connectdata * * connp, CURLcode status, bool premature) Line 616	C
libcurl.dll!multi_runsingle(Curl_multi * multi, curltime now, Curl_easy * data) Line 1710	C
libcurl.dll!curl_multi_perform(Curl_multi * multi, int * running_handles) Line 2168	C
libcurl.dll!easy_transfer(Curl_multi * multi) Line 694	C
libcurl.dll!easy_perform(Curl_easy * data, bool events) Line 780	C
libcurl.dll!curl_easy_perform(Curl_easy * data) Line 799	C
HttpClientDirectTest2.exe!CurlPerformGet(void * pHandle) Line 193	C++
HttpClientDirectTest2.exe!MyThreadFunc(void * pShare) Line 205	C++

And I still see a lot of WAITING sockets per run. If the connection pool cache size is set to its default value 5 shouldn't there be a maximum of 5 waiting sockets after each run?

@bagder

This comment has been minimized.

Show comment Hide comment
@bagder

bagder Dec 4, 2017

Member

okay thanks, then I think we consider that a small step in the right direction at least...

Member

bagder commented Dec 4, 2017

okay thanks, then I think we consider that a small step in the right direction at least...

@bagder

This comment has been minimized.

Show comment Hide comment
@bagder

bagder Dec 4, 2017

Member

If the connection pool cache size is set to its default value 5 shouldn't there be a maximum of 5 waiting sockets after each run?

With how many threads? When the pool fits 5 connections that are all in use when the 6th request ends, it will close that 6th connection. So if you do 100 parallel connections that all end at the same time you'd get about 95 closed connections at once and 5 kept alive in the pool.

Member

bagder commented Dec 4, 2017

If the connection pool cache size is set to its default value 5 shouldn't there be a maximum of 5 waiting sockets after each run?

With how many threads? When the pool fits 5 connections that are all in use when the 6th request ends, it will close that 6th connection. So if you do 100 parallel connections that all end at the same time you'd get about 95 closed connections at once and 5 kept alive in the pool.

@patrickdawson

This comment has been minimized.

Show comment Hide comment
@patrickdawson

patrickdawson Dec 4, 2017

I referenced my example https://gist.github.com/patrickdawson/da3ff09835dfc0c19caf2fd94c105f18, so with 100 threads. Then I misunderstood the option.
But it looks like these additional sockets cause problems in multithreaded environments in windows because I didn't see any crashes with 5 threads and less with your patch.

I referenced my example https://gist.github.com/patrickdawson/da3ff09835dfc0c19caf2fd94c105f18, so with 100 threads. Then I misunderstood the option.
But it looks like these additional sockets cause problems in multithreaded environments in windows because I didn't see any crashes with 5 threads and less with your patch.

@bagder

This comment has been minimized.

Show comment Hide comment
@bagder

bagder Dec 4, 2017

Member

Yeah, I have more work coming up to test soon...

Member

bagder commented Dec 4, 2017

Yeah, I have more work coming up to test soon...

@bagder

This comment has been minimized.

Show comment Hide comment
@bagder

bagder Dec 4, 2017

Member

I updated the PR again...

Member

bagder commented Dec 4, 2017

I updated the PR again...

@jay

This comment has been minimized.

Show comment Hide comment
@jay

jay Dec 5, 2017

Member

Note MAX_THREADS cannot be greater than MAXIMUM_WAIT_OBJECTS, otherwise WaitForMultipleObjects won't wait for all the threads and crashes will occur for other reasons. I suggest assert(MAX_THREADS <= MAXIMUM_WAIT_OBJECTS); if anyone is playing around with the numbers like I am.

Member

jay commented Dec 5, 2017

Note MAX_THREADS cannot be greater than MAXIMUM_WAIT_OBJECTS, otherwise WaitForMultipleObjects won't wait for all the threads and crashes will occur for other reasons. I suggest assert(MAX_THREADS <= MAXIMUM_WAIT_OBJECTS); if anyone is playing around with the numbers like I am.

@patrickdawson

This comment has been minimized.

Show comment Hide comment
@patrickdawson

patrickdawson Dec 5, 2017

Hey. I ran my test program (I referenced my example https://gist.github.com/patrickdawson/da3ff09835dfc0c19caf2fd94c105f18) 20 times and did not see a single crash. It seems like you fixed it.

Hey. I ran my test program (I referenced my example https://gist.github.com/patrickdawson/da3ff09835dfc0c19caf2fd94c105f18) 20 times and did not see a single crash. It seems like you fixed it.

@bagder

This comment has been minimized.

Show comment Hide comment
@bagder

bagder Dec 5, 2017

Member

Awesome, thanks for verifying this. I'll clean up, squash some commits somewhat and merge into master soonish.

Member

bagder commented Dec 5, 2017

Awesome, thanks for verifying this. I'll clean up, squash some commits somewhat and merge into master soonish.

@bagder bagder closed this in 07cb27c Dec 5, 2017

JohnDeHelian pushed a commit to JohnDeHelian/curl that referenced this issue Dec 7, 2017

conncache: fix several lock issues
If the lock is released before the dealings with the bundle is over, it may
have changed by another thread in the mean time.

Fixes #2132
Fixes #2151
Closes #2139

@lock lock bot locked as resolved and limited conversation to collaborators May 6, 2018

Sign up for free to subscribe to this conversation on GitHub. Already have an account? Sign in.